diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index ef21699476..d99be87a47 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -18,7 +18,7 @@ jobs: - name: Check Python code with Black uses: psf/black@stable with: - version: 23.9.1 + version: 24.2.0 options: --check --exclude '/*kubeflow_org_v1*|__init__.py|api_client.py|configuration.py|exceptions.py|rest.py' src: sdk/ diff --git a/examples/pytorch/image-classification/Train CNN with FashionMNIST.ipynb b/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb similarity index 100% rename from examples/pytorch/image-classification/Train CNN with FashionMNIST.ipynb rename to examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb diff --git a/examples/pytorch/text-classification/Fine Tune BERT LLM.ipynb b/examples/pytorch/text-classification/Fine Tune BERT LLM.ipynb deleted file mode 100644 index bf10215ad0..0000000000 --- a/examples/pytorch/text-classification/Fine Tune BERT LLM.ipynb +++ /dev/null @@ -1,683 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Fine-Tune BERT LLM for Sentiment Analysis with Kubeflow PyTorchJob" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This Notebook will fine-tune Bidirectional Encoder Representations from Transformers (BERT) model with Yelp dataset to analyze text sentiment using distributed training with [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/overview/).\n", - "\n", - "Pretrained BERT model: https://huggingface.co/google-bert/bert-base-cased\n", - "\n", - "Yelp review full dataset: https://huggingface.co/datasets/yelp_review_full\n", - "\n", - "This Notebook requires:\n", - "\n", - "- At least **3 GPU** on your Kubernetes cluster to fine-tune BERT model on 3 workers.\n", - "- AWS S3 bucket to export fine-tuned model.\n", - "\n", - "This example is based on [the 
HuggingFace fine-tuning tutorial](https://huggingface.co/docs/transformers/en/training)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Install required packages\n", - "\n", - "We need to install HuggingFace packages to run this Notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install transformers datasets boto3\n", - "\n", - "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Get samples from Yelp reviews dataset\n", - "\n", - "The Yelp reviews full star dataset is constructed by randomly taking 130,000 training samples and 10,000 testing samples for each review star from 1 to 5.\n", - "\n", - "In total there are 650,000 training samples and 50,000 testing samples.\n", - "\n", - "We are going to use this dataset to fine-tune BERT model." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:45:19.125747Z", - "iopub.status.busy": "2024-03-10T00:45:19.125051Z", - "iopub.status.idle": "2024-03-10T00:45:21.775181Z", - "shell.execute_reply": "2024-03-10T00:45:21.774143Z", - "shell.execute_reply.started": "2024-03-10T00:45:19.125725Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'label': 4,\n", - " 'text': \"Top notch doctor in a top notch practice. Can't say I am surprised \"\n", - " 'when I was referred to him by another doctor who I think is '\n", - " 'wonderful and because he went to one of the best medical schools in '\n", - " 'the country. \\\\nIt is really easy to get an appointment. There is '\n", - " 'minimal wait to be seen and his bedside manner is great.'}\n", - "{'label': 1,\n", - " 'text': 'Average run of the mill store. 
Associates are young teens and they '\n", - " \"really don't know where anything is. Luckily I am able to get \"\n", - " 'around to find everything. Found my puppy treats and moved on.'}\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "\n", - "from datasets import load_dataset\n", - "\n", - "# Test only 100 samples in the Notebook.\n", - "dataset = load_dataset(\"yelp_review_full\", split=\"train[:100]\")\n", - "\n", - "# Print some test data.\n", - "pprint(dataset[5])\n", - "pprint(dataset[30])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create script to fine-tune BERT model\n", - "\n", - "We need to wrap our fine-tuning script in a function to create Kubeflow PyTorchJob." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:37:51.012597Z", - "iopub.status.busy": "2024-03-10T00:37:51.012357Z", - "iopub.status.idle": "2024-03-10T00:37:51.021633Z", - "shell.execute_reply": "2024-03-10T00:37:51.020711Z", - "shell.execute_reply.started": "2024-03-10T00:37:51.012581Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "def train_func(parameters):\n", - " import os\n", - "\n", - " import boto3\n", - " import evaluate\n", - " import numpy as np\n", - " from datasets import load_dataset\n", - " from datasets.distributed import split_dataset_by_node\n", - " from transformers import (\n", - " AutoModelForSequenceClassification,\n", - " AutoTokenizer,\n", - " Trainer,\n", - " TrainingArguments,\n", - " )\n", - "\n", - " # [1] Download BERT model, tokenizer, and Yelp dataset.\n", - " print(\"-\" * 40)\n", - " print(\"Download BERT Model\")\n", - " model = AutoModelForSequenceClassification.from_pretrained(\n", - " \"bert-base-cased\",\n", - " num_labels=5,\n", - " )\n", - " tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", - "\n", - " print(\"-\" * 40)\n", - " print(\"Download Yelp Review Dataset\")\n", - "\n", - " # Use 
only 4000 data samples to reduce tokenization and training time.\n", - " # Training samples - 3600, test samples - 400\n", - " # Remove split to take all samples: dataset = load_dataset(\"yelp_review_full\")\n", - " dataset = load_dataset(\"yelp_review_full\", split=\"train[:4000]\")\n", - " dataset = dataset.train_test_split(test_size=0.1, stratify_by_column=\"label\")\n", - "\n", - " # [2] Preprocess dataset.\n", - " def tokenize_function(examples):\n", - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", - "\n", - " # Map Yelp review dataset to BERT tokenizer.\n", - " print(\"-\" * 40)\n", - " print(\"Map Yelp review dataset to BERT Tokenizer\")\n", - " tokenized_ds = dataset.map(tokenize_function, batched=True)\n", - "\n", - " # Distribute train and test datasets between PyTorch workers.\n", - " # Every worker will process chunk of training data.\n", - " # RANK and WORLD_SIZE will be set by Kubeflow Training Operator.\n", - " RANK = int(os.environ[\"RANK\"])\n", - " WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n", - " distributed_ds_train = split_dataset_by_node(\n", - " tokenized_ds[\"train\"],\n", - " rank=RANK,\n", - " world_size=WORLD_SIZE,\n", - " )\n", - " distributed_ds_test = split_dataset_by_node(\n", - " tokenized_ds[\"test\"],\n", - " rank=RANK,\n", - " world_size=WORLD_SIZE,\n", - " )\n", - "\n", - " # Evaluate accuracy.\n", - " metric = evaluate.load(\"accuracy\")\n", - "\n", - " def compute_metrics(eval_pred):\n", - " logits, labels = eval_pred\n", - " predictions = np.argmax(logits, axis=-1)\n", - " return metric.compute(predictions=predictions, references=labels)\n", - "\n", - " # [3] Define Training args.\n", - " training_args = TrainingArguments(\n", - " output_dir=\"test_trainer\",\n", - " evaluation_strategy=\"epoch\",\n", - " disable_tqdm=True,\n", - " log_level=\"info\",\n", - " )\n", - "\n", - " # [4] Define Trainer.\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " 
train_dataset=distributed_ds_train,\n", - " eval_dataset=distributed_ds_test,\n", - " compute_metrics=compute_metrics,\n", - " )\n", - "\n", - " # [5] Fine-tune model.\n", - " print(\"-\" * 40)\n", - " print(f\"Start Distributed Training. RANK: {RANK} WORLD_SIZE: {WORLD_SIZE}\")\n", - "\n", - " trainer.train()\n", - "\n", - " print(\"-\" * 40)\n", - " print(\"Training is complete\")\n", - "\n", - " # [6] Export trained model to S3 from the worker with RANK = 0.\n", - " if RANK == 0:\n", - " trainer.save_model(\"./bert\")\n", - " s3 = boto3.resource(\"s3\")\n", - " bucket = s3.Bucket(parameters[\"BUCKET\"])\n", - " bucket.upload_file(\"bert/config.json\", \"bert/config.json\")\n", - " bucket.upload_file(\"bert/model.safetensors\", \"bert/model.safetensors\")\n", - "\n", - " print(\"-\" * 40)\n", - " print(\"Model is exported to S3\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Kubeflow PyTorchJob to fine-tune BERT on GPUs\n", - "\n", - "Use `TrainingClient()` to create PyTorchJob which will fine-tune BERT on **3 workers** using **1 GPU** for each worker.\n", - "\n", - "Your Kubernetes cluster should have sufficient **GPU** resources available." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:37:52.743447Z", - "iopub.status.busy": "2024-03-10T00:37:52.743202Z", - "iopub.status.idle": "2024-03-10T00:37:52.749400Z", - "shell.execute_reply": "2024-03-10T00:37:52.747484Z", - "shell.execute_reply.started": "2024-03-10T00:37:52.743430Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import uuid\n", - "\n", - "# Make random name for PyTorchJob\n", - "job_name = \"fine-tune-bert-\" + str(uuid.uuid4())[:5]\n", - "\n", - "# Replace `BUCKET_NAME` with your AWS S3 bucket.\n", - "bucket = \"BUCKET_NAME\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:37:54.673961Z", - "iopub.status.busy": "2024-03-10T00:37:54.673715Z", - "iopub.status.idle": "2024-03-10T00:37:54.849353Z", - "shell.execute_reply": "2024-03-10T00:37:54.847915Z", - "shell.execute_reply.started": "2024-03-10T00:37:54.673944Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from kubeflow.training import TrainingClient\n", - "\n", - "# Create PyTorchJob\n", - "TrainingClient().create_job(\n", - " name=job_name,\n", - " train_func=train_func,\n", - " parameters={\"BUCKET\": bucket},\n", - " num_workers=3, # Number of PyTorch workers to use.\n", - " resources_per_worker={\n", - " \"cpu\": \"4\",\n", - " \"memory\": \"10G\",\n", - " \"gpu\": \"1\",\n", - " },\n", - " packages_to_install=[\n", - " \"boto3\",\n", - " \"transformers\",\n", - " \"datasets\",\n", - " \"evaluate\",\n", - " \"accelerate\",\n", - " \"scikit-learn\",\n", - " ], # PIP packages will be installed during PyTorchJob runtime.\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Check the PyTorchJob conditions\n", - "\n", - "Use `TrainingClient()` APIs to get information about created PyTorchJob." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:37:58.701682Z", - "iopub.status.busy": "2024-03-10T00:37:58.701338Z", - "iopub.status.idle": "2024-03-10T00:37:58.747460Z", - "shell.execute_reply": "2024-03-10T00:37:58.746536Z", - "shell.execute_reply.started": "2024-03-10T00:37:58.701664Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PyTorchJob Conditions\n", - "[{'last_transition_time': datetime.datetime(2024, 3, 10, 0, 37, 54, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 10, 0, 37, 54, tzinfo=tzlocal()),\n", - " 'message': 'PyTorchJob fine-tune-bert-1a883 is created.',\n", - " 'reason': 'PyTorchJobCreated',\n", - " 'status': 'True',\n", - " 'type': 'Created'}, {'last_transition_time': datetime.datetime(2024, 3, 10, 0, 37, 56, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2024, 3, 10, 0, 37, 56, tzinfo=tzlocal()),\n", - " 'message': 'PyTorchJob fine-tune-bert-1a883 is running.',\n", - " 'reason': 'PyTorchJobRunning',\n", - " 'status': 'True',\n", - " 'type': 'Running'}]\n", - "----------------------------------------\n", - "PyTorchJob is running\n" - ] - } - ], - "source": [ - "print(\"PyTorchJob Conditions\")\n", - "print(TrainingClient().get_job_conditions(job_name))\n", - "print(\"-\" * 40)\n", - "\n", - "# Wait until PyTorchJob has Running condition.\n", - "job = TrainingClient().wait_for_job_conditions(\n", - " job_name,\n", - " expected_conditions={\"Running\"},\n", - ")\n", - "print(\"PyTorchJob is running\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get the PyTorchJob pod names\n", - "\n", - "Since we set 3 workers, PyTorchJob will create 1 master pod and 2 worker pods to execute distributed training." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:38:02.257947Z", - "iopub.status.busy": "2024-03-10T00:38:02.257697Z", - "iopub.status.idle": "2024-03-10T00:38:02.307198Z", - "shell.execute_reply": "2024-03-10T00:38:02.306329Z", - "shell.execute_reply.started": "2024-03-10T00:38:02.257930Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['fine-tune-bert-1a883-master-0',\n", - " 'fine-tune-bert-1a883-worker-0',\n", - " 'fine-tune-bert-1a883-worker-1']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "TrainingClient().get_job_pod_names(job_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": { - "iopub.status.busy": "2022-09-01T20:10:25.759950Z", - "iopub.status.idle": "2022-09-01T20:10:25.760581Z", - "shell.execute_reply": "2022-09-01T20:10:25.760353Z", - "shell.execute_reply.started": "2022-09-01T20:10:25.760328Z" - }, - "tags": [] - }, - "source": [ - "### Get the PyTorchJob training logs\n", - "\n", - "Every worker processes 1200 training samples on each epoch since we distribute 3600 training samples across 3 workers." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:38:05.788903Z", - "iopub.status.busy": "2024-03-10T00:38:05.788625Z", - "iopub.status.idle": "2024-03-10T00:40:25.904118Z", - "shell.execute_reply": "2024-03-10T00:40:25.903020Z", - "shell.execute_reply.started": "2024-03-10T00:38:05.788883Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Pod fine-tune-bert-1a883-master-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", - "[Pod fine-tune-bert-1a883-master-0]: ----------------------------------------\n", - "[Pod fine-tune-bert-1a883-master-0]: Download BERT Model\n", - "[Pod fine-tune-bert-1a883-master-0]: Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "[Pod fine-tune-bert-1a883-master-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "[Pod fine-tune-bert-1a883-master-0]: ----------------------------------------\n", - "[Pod fine-tune-bert-1a883-master-0]: Download Yelp Review Dataset\n", - "Downloading readme: 100%|██████████| 6.72k/6.72k [00:00<00:00, 26.2MB/s]\n", - "Downloading data: 100%|██████████| 299M/299M [00:05<00:00, 57.4MB/s] \n", - "Downloading data: 100%|██████████| 23.5M/23.5M [00:00<00:00, 45.3MB/s]\n", - "Generating train split: 100%|██████████| 650000/650000 [00:01<00:00, 371416.73 examples/s]\n", - "Generating test split: 100%|██████████| 50000/50000 [00:00<00:00, 363106.11 examples/s]\n", - "[Pod fine-tune-bert-1a883-master-0]: ----------------------------------------\n", - "[Pod fine-tune-bert-1a883-master-0]: Map Yelp review dataset to BERT Tokenizer\n", - "Map: 100%|██████████| 3600/3600 [00:01<00:00, 2464.94 examples/s]\n", - "Map: 100%|██████████| 400/400 [00:00<00:00, 2553.52 examples/s]\n", - "Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 16.6MB/s]\n", - "[Pod fine-tune-bert-1a883-master-0]: /opt/conda/lib/python3.10/site-packages/accelerate/state.py:306: UserWarning: OMP_NUM_THREADS/MKL_NUM_THREADS unset, we set it at 16 to improve oob performance.\n", - "[Pod fine-tune-bert-1a883-master-0]: warnings.warn(\n", - "[Pod fine-tune-bert-1a883-master-0]: ----------------------------------------\n", - "[Pod fine-tune-bert-1a883-master-0]: 
Start Distributed Training. RANK: 0 WORLD_SIZE: 3\n", - "[Pod fine-tune-bert-1a883-master-0]: The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n", - "[Pod fine-tune-bert-1a883-master-0]: ***** Running training *****\n", - "[Pod fine-tune-bert-1a883-master-0]: Num examples = 1,200\n", - "[Pod fine-tune-bert-1a883-master-0]: Num Epochs = 3\n", - "[Pod fine-tune-bert-1a883-master-0]: Instantaneous batch size per device = 8\n", - "[Pod fine-tune-bert-1a883-master-0]: Total train batch size (w. parallel, distributed & accumulation) = 24\n", - "[Pod fine-tune-bert-1a883-master-0]: Gradient Accumulation steps = 1\n", - "[Pod fine-tune-bert-1a883-master-0]: Total optimization steps = 150\n", - "[Pod fine-tune-bert-1a883-master-0]: Number of trainable parameters = 108,314,117\n", - "[Pod fine-tune-bert-1a883-master-0]: [W reducer.cpp:1346] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", - "[Pod fine-tune-bert-1a883-master-0]: The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n", - "[Pod fine-tune-bert-1a883-master-0]: ***** Running Evaluation *****\n", - "[Pod fine-tune-bert-1a883-master-0]: Num examples = 134\n", - "[Pod fine-tune-bert-1a883-master-0]: Batch size = 8\n", - "[Pod fine-tune-bert-1a883-master-0]: {'eval_loss': 1.2028350830078125, 'eval_accuracy': 0.4925373134328358, 'eval_runtime': 0.5392, 'eval_samples_per_second': 248.532, 'eval_steps_per_second': 11.128, 'epoch': 1.0}\n", - "[Pod fine-tune-bert-1a883-master-0]: The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n", - "[Pod fine-tune-bert-1a883-master-0]: ***** Running Evaluation *****\n", - "[Pod fine-tune-bert-1a883-master-0]: Num examples = 134\n", - "[Pod fine-tune-bert-1a883-master-0]: Batch size = 8\n", - "[Pod fine-tune-bert-1a883-master-0]: {'eval_loss': 0.9666597843170166, 'eval_accuracy': 0.5895522388059702, 'eval_runtime': 0.5656, 'eval_samples_per_second': 236.909, 'eval_steps_per_second': 10.608, 'epoch': 2.0}\n", - "[Pod fine-tune-bert-1a883-master-0]: The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n", - "[Pod fine-tune-bert-1a883-master-0]: ***** Running Evaluation *****\n", - "[Pod fine-tune-bert-1a883-master-0]: Num examples = 134\n", - "[Pod fine-tune-bert-1a883-master-0]: Batch size = 8\n", - "[Pod fine-tune-bert-1a883-master-0]: {'eval_loss': 0.852095901966095, 'eval_accuracy': 0.6268656716417911, 'eval_runtime': 0.5951, 'eval_samples_per_second': 225.172, 'eval_steps_per_second': 10.082, 'epoch': 3.0}\n", - "[Pod fine-tune-bert-1a883-master-0]: Training completed. Do not forget to share your model on huggingface.co/models =)\n", - "[Pod fine-tune-bert-1a883-master-0]: {'train_runtime': 73.6766, 'train_samples_per_second': 48.862, 'train_steps_per_second': 2.036, 'train_loss': 1.166010030110677, 'epoch': 3.0}\n", - "[Pod fine-tune-bert-1a883-master-0]: ----------------------------------------\n", - "[Pod fine-tune-bert-1a883-master-0]: Training is complete\n", - "[Pod fine-tune-bert-1a883-master-0]: Saving model checkpoint to ./bert\n", - "[Pod fine-tune-bert-1a883-master-0]: Configuration saved in ./bert/config.json\n", - "[Pod fine-tune-bert-1a883-master-0]: Model weights saved in ./bert/model.safetensors\n", - "[Pod fine-tune-bert-1a883-master-0]: ----------------------------------------\n", - "[Pod fine-tune-bert-1a883-master-0]: Model is exported to S3\n" - ] - } - ], - "source": [ - "logs, _ = TrainingClient().get_job_logs(job_name, follow=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Download the fine-tuned model\n", - "\n", - "We can download our fine-tuned BERT model from S3 to evaluate it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:41:32.463113Z", - "iopub.status.busy": "2024-03-10T00:41:32.462861Z", - "iopub.status.idle": "2024-03-10T00:41:34.615767Z", - "shell.execute_reply": "2024-03-10T00:41:34.615101Z", - "shell.execute_reply.started": "2024-03-10T00:41:32.463095Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import boto3\n", - "\n", - "s3 = boto3.resource(\"s3\")\n", - "bucket = s3.Bucket(bucket)\n", - "\n", - "# config.json is the model metadata.\n", - "# model.safetensors is the model weights & biases.\n", - "if not os.path.exists(\"bert\"):\n", - " os.makedirs(\"bert\")\n", - "bucket.download_file(\"bert/config.json\", \"bert/config.json\")\n", - "bucket.download_file(\"bert/model.safetensors\", \"bert/model.safetensors\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Test the fine-tuned BERT model\n", - "\n", - "We are going to use HuggingFace pipeline to test our model.\n", - "\n", - "We will ask for sentiment analysis task for our fine-tuned LLM." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:43:29.026194Z", - "iopub.status.busy": "2024-03-10T00:43:29.025948Z", - "iopub.status.idle": "2024-03-10T00:43:29.651226Z", - "shell.execute_reply": "2024-03-10T00:43:29.650644Z", - "shell.execute_reply.started": "2024-03-10T00:43:29.026177Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This is one of the best restaurants I've ever been to.\n", - "Star: 4\n", - "Score: 0.8029219508171082\n", - "---------------------------\n", - "\n", - "\n", - "I am upset by using this service. 
It is very expensive and quality is bad.\n", - "Star: 1\n", - "Score: 0.5185158848762512\n", - "---------------------------\n" - ] - } - ], - "source": [ - "from transformers import AutoTokenizer, pipeline\n", - "\n", - "# During fine-tuning BERT tokenizer is not changed.\n", - "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", - "\n", - "# Use pipeline with sentiment-analysis task to evaluate our model.\n", - "nlp = pipeline(\"sentiment-analysis\", model=\"./bert\", tokenizer=tokenizer)\n", - "\n", - "good_review = \"This is one of the best restaurants I've ever been to.\"\n", - "bad_review = \"I am upset by using this service. It is very expensive and quality is bad.\"\n", - "\n", - "print(good_review)\n", - "res = nlp(good_review)\n", - "\n", - "print(\"Star: \", res[0][\"label\"][6])\n", - "print(\"Score: \", res[0][\"score\"])\n", - "print(\"---------------------------\\n\\n\")\n", - "\n", - "\n", - "print(bad_review)\n", - "res = nlp(bad_review)\n", - "\n", - "print(\"Star: \", res[0][\"label\"][6])\n", - "print(\"Score: \", res[0][\"score\"])\n", - "print(\"---------------------------\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-01T23:44:15.511173Z", - "iopub.status.busy": "2024-03-01T23:44:15.510932Z", - "iopub.status.idle": "2024-03-01T23:44:15.539921Z", - "shell.execute_reply": "2024-03-01T23:44:15.539352Z", - "shell.execute_reply.started": "2024-03-01T23:44:15.511155Z" - }, - "tags": [] - }, - "source": [ - "## Delete the PyTorchJob\n", - "\n", - "When PyTorchJob is finished, you can delete the resource." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-10T00:43:41.129972Z", - "iopub.status.busy": "2024-03-10T00:43:41.129720Z", - "iopub.status.idle": "2024-03-10T00:43:41.157373Z", - "shell.execute_reply": "2024-03-10T00:43:41.156125Z", - "shell.execute_reply.started": "2024-03-10T00:43:41.129955Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "TrainingClient().delete_job(name=job_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.17" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/examples/pytorch/text-classification/Fine-Tune-BERT-LLM.ipynb b/examples/pytorch/text-classification/Fine-Tune-BERT-LLM.ipynb new file mode 100644 index 0000000000..58778727c4 --- /dev/null +++ b/examples/pytorch/text-classification/Fine-Tune-BERT-LLM.ipynb @@ -0,0 +1,882 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-Tune BERT LLM for Sentiment Analysis with Kubeflow PyTorchJob" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This Notebook will fine-tune Bidirectional Encoder Representations from Transformers (BERT) model with Yelp dataset to analyze text sentiment using distributed training with [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/overview/).\n", + "\n", + "Pretrained BERT model: https://huggingface.co/google-bert/bert-base-cased\n", + "\n", + "Yelp review full dataset: 
https://huggingface.co/datasets/yelp_review_full\n", + "\n", + "This Notebook requires:\n", + "\n", + "- At least **3 GPU** on your Kubernetes cluster to fine-tune BERT model on 3 workers.\n", + "- AWS S3 bucket to export fine-tuned model.\n", + "\n", + "This example is based on [the HuggingFace fine-tuning tutorial](https://huggingface.co/docs/transformers/en/training)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Install required packages\n", + "\n", + "We need to install HuggingFace packages to run this Notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install transformers datasets boto3\n", + "\n", + "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Get samples from Yelp reviews dataset\n", + "\n", + "The Yelp reviews full star dataset is constructed by randomly taking 130,000 training samples and 10,000 testing samples for each review star from 1 to 5.\n", + "\n", + "In total there are 650,000 training samples and 50,000 testing samples.\n", + "\n", + "We are going to use this dataset to fine-tune BERT model." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'label': 4,\n", + " 'text': \"Top notch doctor in a top notch practice. Can't say I am surprised \"\n", + " 'when I was referred to him by another doctor who I think is '\n", + " 'wonderful and because he went to one of the best medical schools in '\n", + " 'the country. \\\\nIt is really easy to get an appointment. There is '\n", + " 'minimal wait to be seen and his bedside manner is great.'}\n", + "{'label': 1,\n", + " 'text': 'Average run of the mill store. 
Associates are young teens and they '\n", + " \"really don't know where anything is. Luckily I am able to get \"\n", + " 'around to find everything. Found my puppy treats and moved on.'}\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "from datasets import load_dataset\n", + "\n", + "# Test only 100 samples in the Notebook.\n", + "dataset = load_dataset(\"yelp_review_full\", split=\"train[:100]\")\n", + "\n", + "# Print some test data.\n", + "pprint(dataset[5])\n", + "pprint(dataset[30])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create script to fine-tune BERT model\n", + "\n", + "We need to wrap our fine-tuning script in a function to create Kubeflow PyTorchJob." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def train_func(parameters):\n", + " import os\n", + "\n", + " import boto3\n", + " import evaluate\n", + " import numpy as np\n", + " from datasets import load_dataset\n", + " from datasets.distributed import split_dataset_by_node\n", + " from transformers import (\n", + " AutoModelForSequenceClassification,\n", + " AutoTokenizer,\n", + " Trainer,\n", + " TrainingArguments,\n", + " )\n", + "\n", + " # [1] Download BERT model, tokenizer, and Yelp dataset.\n", + " print(\"-\" * 40)\n", + " print(\"Download BERT Model\")\n", + " model = AutoModelForSequenceClassification.from_pretrained(\n", + " \"bert-base-cased\",\n", + " num_labels=5,\n", + " )\n", + " tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", + "\n", + " print(\"-\" * 40)\n", + " print(\"Download Yelp Review Dataset\")\n", + "\n", + " # Use only 4000 data samples to reduce tokenization and training time.\n", + " # Training samples - 3600, test samples - 400\n", + " # Remove split to take all samples: dataset = load_dataset(\"yelp_review_full\")\n", + " dataset = load_dataset(\"yelp_review_full\", split=\"train[:4000]\")\n", + " dataset = 
dataset.train_test_split(test_size=0.1, stratify_by_column=\"label\")\n", + "\n", + " # [2] Preprocess dataset.\n", + " def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + "\n", + " # Map Yelp review dataset to BERT tokenizer.\n", + " print(\"-\" * 40)\n", + " print(\"Map Yelp review dataset to BERT Tokenizer\")\n", + " tokenized_ds = dataset.map(tokenize_function, batched=True)\n", + "\n", + " # Distribute train and test datasets between PyTorch workers.\n", + " # Every worker will process chunk of training data.\n", + " # RANK and WORLD_SIZE will be set by Kubeflow Training Operator.\n", + " RANK = int(os.environ[\"RANK\"])\n", + " WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n", + " distributed_ds_train = split_dataset_by_node(\n", + " tokenized_ds[\"train\"],\n", + " rank=RANK,\n", + " world_size=WORLD_SIZE,\n", + " )\n", + " distributed_ds_test = split_dataset_by_node(\n", + " tokenized_ds[\"test\"],\n", + " rank=RANK,\n", + " world_size=WORLD_SIZE,\n", + " )\n", + "\n", + " # Evaluate accuracy.\n", + " metric = evaluate.load(\"accuracy\")\n", + "\n", + " def compute_metrics(eval_pred):\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", + "\n", + " # [3] Define Training args.\n", + " training_args = TrainingArguments(\n", + " output_dir=\"test_trainer\",\n", + " evaluation_strategy=\"epoch\",\n", + " disable_tqdm=True,\n", + " log_level=\"info\",\n", + " )\n", + "\n", + " # [4] Define Trainer.\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=distributed_ds_train,\n", + " eval_dataset=distributed_ds_test,\n", + " compute_metrics=compute_metrics,\n", + " )\n", + "\n", + " # [5] Fine-tune model.\n", + " print(\"-\" * 40)\n", + " print(f\"Start Distributed Training. 
RANK: {RANK} WORLD_SIZE: {WORLD_SIZE}\")\n", + "\n", + " trainer.train()\n", + "\n", + " print(\"-\" * 40)\n", + " print(\"Training is complete\")\n", + "\n", + " # [6] Export trained model to S3 from the worker with RANK = 0.\n", + " if RANK == 0:\n", + " trainer.save_model(\"./bert\")\n", + " s3 = boto3.resource(\"s3\")\n", + " bucket = s3.Bucket(parameters[\"BUCKET\"])\n", + " bucket.upload_file(\"bert/config.json\", \"bert/config.json\")\n", + " bucket.upload_file(\"bert/model.safetensors\", \"bert/model.safetensors\")\n", + "\n", + " print(\"-\" * 40)\n", + " print(\"Model is exported to S3\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Kubeflow PyTorchJob to fine-tune BERT on GPUs\n", + "\n", + "Use `TrainingClient()` to create PyTorchJob which will fine-tune BERT on **3 workers** using **1 GPU** for each worker.\n", + "\n", + "Your Kubernetes cluster should have sufficient **GPU** resources available." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import uuid\n", + "from kubeflow.training import TrainingClient\n", + "\n", + "job_name = \"fine-tune-bert\"\n", + "\n", + "# Replace `kubeflow-examples` with your AWS S3 bucket.\n", + "bucket = \"kubeflow-examples\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Create PyTorchJob\n", + "TrainingClient().create_job(\n", + " name=job_name,\n", + " train_func=train_func,\n", + " parameters={\"BUCKET\": bucket},\n", + " num_workers=3, # Number of PyTorch workers to use.\n", + " resources_per_worker={\n", + " \"cpu\": \"4\",\n", + " \"memory\": \"10G\",\n", + " \"gpu\": \"1\",\n", + " },\n", + " packages_to_install=[\n", + " \"boto3\",\n", + " \"transformers\",\n", + " \"datasets\",\n", + " \"evaluate\",\n", + " \"accelerate\",\n", + " \"scikit-learn\",\n", + " ], # PIP packages will be installed during PyTorchJob 
runtime.\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Check the PyTorchJob conditions\n", + "\n", + "Use `TrainingClient()` APIs to get information about created PyTorchJob." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PyTorchJob Conditions\n", + "[{'last_transition_time': datetime.datetime(2024, 3, 15, 16, 31, 30, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2024, 3, 15, 16, 31, 30, tzinfo=tzutc()),\n", + " 'message': 'PyTorchJob fine-tune-bert is created.',\n", + " 'reason': 'PyTorchJobCreated',\n", + " 'status': 'True',\n", + " 'type': 'Created'}, {'last_transition_time': datetime.datetime(2024, 3, 15, 16, 31, 31, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2024, 3, 15, 16, 31, 31, tzinfo=tzutc()),\n", + " 'message': 'PyTorchJob fine-tune-bert is running.',\n", + " 'reason': 'PyTorchJobRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}]\n", + "----------------------------------------\n", + "PyTorchJob is running\n" + ] + } + ], + "source": [ + "print(\"PyTorchJob Conditions\")\n", + "print(TrainingClient().get_job_conditions(job_name))\n", + "print(\"-\" * 40)\n", + "\n", + "# Wait until PyTorchJob has Running condition.\n", + "job = TrainingClient().wait_for_job_conditions(\n", + " job_name,\n", + " expected_conditions={\"Running\"},\n", + ")\n", + "print(\"PyTorchJob is running\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get the PyTorchJob pod names\n", + "\n", + "Since we set 3 workers, PyTorchJob will create 1 master pod and 2 worker pods to execute distributed training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['fine-tune-bert-master-0',\n", + " 'fine-tune-bert-worker-0',\n", + " 'fine-tune-bert-worker-1']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "TrainingClient().get_job_pod_names(job_name)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": { + "iopub.status.busy": "2022-09-01T20:10:25.759950Z", + "iopub.status.idle": "2022-09-01T20:10:25.760581Z", + "shell.execute_reply": "2022-09-01T20:10:25.760353Z", + "shell.execute_reply.started": "2022-09-01T20:10:25.760328Z" + }, + "tags": [] + }, + "source": [ + "### Get the PyTorchJob training logs\n", + "\n", + "Every worker processes 1200 training samples on each epoch since we distribute 3600 training samples across 3 workers." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Pod fine-tune-bert-master-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", + "[Pod fine-tune-bert-master-0]: WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n", + "[Pod fine-tune-bert-master-0]: ----------------------------------------\n", + "[Pod fine-tune-bert-master-0]: Download BERT Model\n", + "[Pod fine-tune-bert-master-0]: Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "[Pod fine-tune-bert-master-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "[Pod fine-tune-bert-master-0]: ----------------------------------------\n", + "[Pod fine-tune-bert-master-0]: Download Yelp Review Dataset\n", + "Downloading readme: 100%|██████████| 6.72k/6.72k [00:00<00:00, 30.2MB/s]\n", + "Downloading data: 100%|██████████| 299M/299M [00:05<00:00, 59.7MB/s] \n", + "Downloading data: 100%|██████████| 23.5M/23.5M [00:00<00:00, 51.6MB/s]\n", + "Generating train split: 100%|██████████| 650000/650000 [00:01<00:00, 368141.59 examples/s]\n", + "Generating test split: 100%|██████████| 50000/50000 [00:00<00:00, 360107.08 examples/s]\n", + "[Pod fine-tune-bert-master-0]: ----------------------------------------\n", + "[Pod fine-tune-bert-master-0]: Map Yelp review dataset to BERT Tokenizer\n", + "Map: 100%|██████████| 3600/3600 [00:01<00:00, 2452.88 examples/s]\n", + "Map: 100%|██████████| 400/400 [00:00<00:00, 2591.52 examples/s]\n", + "Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 15.9MB/s]\n", + "[Pod fine-tune-bert-master-0]: /opt/conda/lib/python3.10/site-packages/accelerate/state.py:313: UserWarning: OMP_NUM_THREADS/MKL_NUM_THREADS unset, we set it at 16 to improve oob performance.\n", + "[Pod fine-tune-bert-master-0]: warnings.warn(\n", + "[Pod fine-tune-bert-master-0]: /opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:432: FutureWarning: Passing the following arguments to `Accelerator` is deprecated 
and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an `accelerate.DataLoaderConfiguration` instead: \n", + "[Pod fine-tune-bert-master-0]: dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)\n", + "[Pod fine-tune-bert-master-0]: warnings.warn(\n", + "[Pod fine-tune-bert-master-0]: ----------------------------------------\n", + "[Pod fine-tune-bert-master-0]: Start Distributed Training. RANK: 0 WORLD_SIZE: 3\n", + "[Pod fine-tune-bert-master-0]: The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n", + "[Pod fine-tune-bert-master-0]: ***** Running training *****\n", + "[Pod fine-tune-bert-master-0]: Num examples = 1,200\n", + "[Pod fine-tune-bert-master-0]: Num Epochs = 3\n", + "[Pod fine-tune-bert-master-0]: Instantaneous batch size per device = 8\n", + "[Pod fine-tune-bert-master-0]: Total train batch size (w. parallel, distributed & accumulation) = 24\n", + "[Pod fine-tune-bert-master-0]: Gradient Accumulation steps = 1\n", + "[Pod fine-tune-bert-master-0]: Total optimization steps = 150\n", + "[Pod fine-tune-bert-master-0]: Number of trainable parameters = 108,314,117\n", + "[Pod fine-tune-bert-master-0]: [W reducer.cpp:1346] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. 
Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", + "[Pod fine-tune-bert-master-0]: The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n", + "[Pod fine-tune-bert-master-0]: ***** Running Evaluation *****\n", + "[Pod fine-tune-bert-master-0]: Num examples = 134\n", + "[Pod fine-tune-bert-master-0]: Batch size = 8\n", + "[Pod fine-tune-bert-master-0]: {'eval_loss': 1.0521148443222046, 'eval_accuracy': 0.5746268656716418, 'eval_runtime': 0.5213, 'eval_samples_per_second': 257.033, 'eval_steps_per_second': 11.509, 'epoch': 1.0}\n", + "[Pod fine-tune-bert-master-0]: The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n", + "[Pod fine-tune-bert-master-0]: ***** Running Evaluation *****\n", + "[Pod fine-tune-bert-master-0]: Num examples = 134\n", + "[Pod fine-tune-bert-master-0]: Batch size = 8\n", + "[Pod fine-tune-bert-master-0]: {'eval_loss': 0.9855704307556152, 'eval_accuracy': 0.5895522388059702, 'eval_runtime': 0.5239, 'eval_samples_per_second': 255.763, 'eval_steps_per_second': 11.452, 'epoch': 2.0}\n", + "[Pod fine-tune-bert-master-0]: The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `BertForSequenceClassification.forward`, you can safely ignore this message.\n", + "[Pod fine-tune-bert-master-0]: ***** Running Evaluation *****\n", + "[Pod fine-tune-bert-master-0]: Num examples = 134\n", + "[Pod fine-tune-bert-master-0]: Batch size = 8\n", + "[Pod fine-tune-bert-master-0]: {'eval_loss': 0.9247522354125977, 'eval_accuracy': 0.6492537313432836, 'eval_runtime': 0.527, 'eval_samples_per_second': 254.259, 'eval_steps_per_second': 11.385, 'epoch': 3.0}\n", + "[Pod fine-tune-bert-master-0]: Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "[Pod fine-tune-bert-master-0]: {'train_runtime': 73.331, 'train_samples_per_second': 49.092, 'train_steps_per_second': 2.046, 'train_loss': 1.0898309326171876, 'epoch': 3.0}\n", + "[Pod fine-tune-bert-master-0]: ----------------------------------------\n", + "[Pod fine-tune-bert-master-0]: Training is complete\n", + "[Pod fine-tune-bert-master-0]: Saving model checkpoint to ./bert\n", + "[Pod fine-tune-bert-master-0]: Configuration saved in ./bert/config.json\n", + "[Pod fine-tune-bert-master-0]: Model weights saved in ./bert/model.safetensors\n", + "[Pod fine-tune-bert-master-0]: ----------------------------------------\n", + "[Pod fine-tune-bert-master-0]: Model is exported to S3\n" + ] + } + ], + "source": [ + "logs, _ = TrainingClient().get_job_logs(job_name, follow=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download the fine-tuned model\n", + "\n", + "We can download our fine-tuned BERT model from S3 to evaluate it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import boto3\n", + "import os\n", + "\n", + "s3 = boto3.resource(\"s3\")\n", + "bucket = s3.Bucket(bucket)\n", + "\n", + "# config.json is the model metadata.\n", + "# model.safetensors is the model weights & biases.\n", + "if not os.path.exists(\"bert\"):\n", + " os.makedirs(\"bert\")\n", + "bucket.download_file(\"bert/config.json\", \"bert/config.json\")\n", + "bucket.download_file(\"bert/model.safetensors\", \"bert/model.safetensors\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Test the fine-tuned BERT model\n", + "\n", + "We are going to use HuggingFace pipeline to test our model.\n", + "\n", + "We will ask for sentiment analysis task for our fine-tuned LLM." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is one of the best restaurants I've ever been to.\n", + "Star: 4\n", + "Score: 0.806443452835083\n", + "---------------------------\n", + "\n", + "\n", + "I am upset by using this service. It is very expensive and quality is bad.\n", + "Star: 1\n", + "Score: 0.6581875085830688\n", + "---------------------------\n" + ] + } + ], + "source": [ + "from transformers import AutoTokenizer, pipeline\n", + "\n", + "# During fine-tuning BERT tokenizer is not changed.\n", + "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", + "\n", + "# Use pipeline with sentiment-analysis task to evaluate our model.\n", + "nlp = pipeline(\"sentiment-analysis\", model=\"./bert\", tokenizer=tokenizer)\n", + "\n", + "good_review = \"This is one of the best restaurants I've ever been to.\"\n", + "bad_review = \"I am upset by using this service. 
It is very expensive and quality is bad.\"\n", + "\n", + "print(good_review)\n", + "res = nlp(good_review)\n", + "\n", + "print(\"Star: \", res[0][\"label\"][6])\n", + "print(\"Score: \", res[0][\"score\"])\n", + "print(\"---------------------------\\n\\n\")\n", + "\n", + "\n", + "print(bad_review)\n", + "res = nlp(bad_review)\n", + "\n", + "print(\"Star: \", res[0][\"label\"][6])\n", + "print(\"Score: \", res[0][\"score\"])\n", + "print(\"---------------------------\")" + ] + }, + { + "attachments": { + "348c13f1-f7df-4148-9c2e-268c05dc1d16.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Train API to Fine-Tune BERT LLM\n", + "\n", + "Kubeflow Training Operator SDK implements a `train` API to effectively fine-tune LLMs on multiple PyTorchJob workers with required configuration. It uses storage initializer to download pre-trained model and dataset, and distribute it across PyTorchJob workers using shared PVCs. After initialization step, pre-created HuggingFace LLM trainer will be executed on each PyTorchJob worker to fine-tune BERT model.\n", + "\n", + "This feature is in **Development Phase**, please provide your feedback by creating [the GitHub issues](https://github.com/kubeflow/training-operator/issues/new) or by using [the Kubeflow Slack channel #kubeflow-training-operator](https://kubeflow.slack.com/archives/C985VJN9F).\n", + "\n", + "To learn more about it check [this proposal](https://github.com/kubeflow/training-operator/blob/master/docs/proposals/train_api_proposal.md).\n", + "\n", + "**TODO (andreyvelich)**: Add docs link when they are ready.\n", + "\n", + "![train-api.png](attachment:348c13f1-f7df-4148-9c2e-268c05dc1d16.png)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install Training Operator SDK to use `train` API\n", + "\n", + "You have to install `kubeflow-training` SDK with the HuggingFace dependencies to use `train` API.\n" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install \"kubeflow-training[huggingface] @ git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create PyTorchJob using train API\n", + "\n", + "If your Kubernetes environment [supports `ReadOnlyMany` and `ReadWriteOnce` access modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) for PersistentVolumeClaims (PVCs), you can use more than 1 PyTorchJob worker in `train` API." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from kubeflow.training import TrainingClient\n", + "from kubeflow.storage_initializer.hugging_face import (\n", + " HuggingFaceModelParams,\n", + " HuggingFaceTrainParams,\n", + " HfDatasetParams,\n", + ")\n", + "\n", + "import transformers\n", + "from peft import LoraConfig\n", + "\n", + "job_name_train_api = \"fine-tune-bert-train-api\"\n", + "\n", + "# Set TOKENIZERS_PARALLELISM = false to avoid warnings from Transformers.\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# In this example we will use 1 worker and 1 GPU to fine-tune BERT with `train` API.\n", + "TrainingClient().train(\n", + " name=job_name_train_api,\n", + " num_workers=1, # nnodes parameter for torchrun command.\n", + " num_procs_per_worker=1, # nproc-per-node parameter for torchrun command.\n", + " # BERT model URI and type of Transformer to train it.\n", + " model_provider_parameters=HuggingFaceModelParams(\n", + " model_uri=\"hf://google-bert/bert-base-cased\",\n", + " transformer_type=transformers.AutoModelForSequenceClassification,\n", + " ),\n", + " storage_config={\n", + " \"access_modes\": [\"ReadWriteOnce\"] # Since we use 1 Worker, PVC access mode is 
ReadWriteOnce.\n", + " },\n", + " # Use 3000 samples from Yelp dataset.\n", + " dataset_provider_parameters=HfDatasetParams(\n", + " repo_id=\"yelp_review_full\",\n", + " split=\"train[:3000]\",\n", + " ),\n", + " # Specify HuggingFace Trainer parameters. In this example, we will skip evaluation and model checkpoints.\n", + " train_parameters=HuggingFaceTrainParams(\n", + " training_parameters=transformers.TrainingArguments(\n", + " output_dir=\"test_trainer\",\n", + " save_strategy=\"no\",\n", + " evaluation_strategy=\"no\",\n", + " do_eval=False,\n", + " disable_tqdm=True,\n", + " log_level=\"info\",\n", + " ),\n", + " # Set LoRA config to reduce number of trainable model parameters. \n", + " lora_config=LoraConfig(\n", + " r=8,\n", + " lora_alpha=8,\n", + " lora_dropout=0.1,\n", + " bias=\"none\",\n", + " ),\n", + " ),\n", + " resources_per_worker={\n", + " \"gpu\": 1,\n", + " \"cpu\": 5,\n", + " \"memory\": \"10G\",\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get the PyTorchJob containers\n", + "\n", + "When using `train` API, every PyTorchJob worker (Kubernetes Pod) should have `storage-initializer` initContainer and volume.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PyTorchJob Init Containers\n", + "storage-initializer\n", + "----------------------------------------\n", + "PyTorchJob Volumes\n", + "storage-initializer\n" + ] + } + ], + "source": [ + "pytorchjob = TrainingClient().get_job(job_name_train_api)\n", + "\n", + "print(\"PyTorchJob Init Containers\")\n", + "for c in pytorchjob.spec.pytorch_replica_specs[\"Master\"].template.spec.init_containers:\n", + " print(c.name)\n", + "\n", + "print(\"-\" * 40)\n", + "\n", + "print(\"PyTorchJob Volumes\")\n", + "for v in pytorchjob.spec.pytorch_replica_specs[\"Master\"].template.spec.volumes:\n", + " print(v.name)" + ] + }, + { +
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get the PyTorchJob training logs\n", + "\n", + "Use the same API to get created PyTorchJob logs.\n", + "\n", + "Since we used LoRA config, number of trainable parameters is smaller: **294 912**" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:47Z INFO Starting HuggingFace LLM Trainer\n", + "[Pod fine-tune-bert-train-api-master-0]: /usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1741: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--hub_token` instead.\n", + "[Pod fine-tune-bert-train-api-master-0]: warnings.warn(\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:47Z INFO Setup model and tokenizer\n", + "[Pod fine-tune-bert-train-api-master-0]: Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "[Pod fine-tune-bert-train-api-master-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO Preprocess dataset\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO Load and preprocess dataset\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO Dataset specification: Dataset({\n", + "[Pod fine-tune-bert-train-api-master-0]: features: ['label', 'text'],\n", + "[Pod fine-tune-bert-train-api-master-0]: num_rows: 3000\n", + "[Pod fine-tune-bert-train-api-master-0]: })\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO ----------------------------------------\n", + "[Pod fine-tune-bert-train-api-master-0]: 
2024-03-15T16:45:48Z INFO Tokenize dataset\n", + "Map: 100%|██████████| 3000/3000 [00:01<00:00, 2759.84 examples/s]\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:51Z INFO Evaluation dataset is not found\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:51Z INFO Distributed dataset across PyTorchJob workers. WORLD_SIZE: 1, RANK: 0\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:51Z INFO Setup LoRA config for model\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:51Z INFO Start model training\n", + "[Pod fine-tune-bert-train-api-master-0]: /usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py:432: FutureWarning: Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches']). Please pass an `accelerate.DataLoaderConfiguration` instead: \n", + "[Pod fine-tune-bert-train-api-master-0]: dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)\n", + "[Pod fine-tune-bert-train-api-master-0]: warnings.warn(\n", + "[Pod fine-tune-bert-train-api-master-0]: The following columns in the training set don't have a corresponding argument in `PeftModel.forward` and have been ignored: text. If text are not expected by `PeftModel.forward`, you can safely ignore this message.\n", + "[Pod fine-tune-bert-train-api-master-0]: ***** Running training *****\n", + "[Pod fine-tune-bert-train-api-master-0]: Num examples = 3,000\n", + "[Pod fine-tune-bert-train-api-master-0]: Num Epochs = 3\n", + "[Pod fine-tune-bert-train-api-master-0]: Instantaneous batch size per device = 8\n", + "[Pod fine-tune-bert-train-api-master-0]: Total train batch size (w. 
parallel, distributed & accumulation) = 8\n", + "[Pod fine-tune-bert-train-api-master-0]: Gradient Accumulation steps = 1\n", + "[Pod fine-tune-bert-train-api-master-0]: Total optimization steps = 1,125\n", + "[Pod fine-tune-bert-train-api-master-0]: Number of trainable parameters = 294,912\n", + "[Pod fine-tune-bert-train-api-master-0]: [W reducer.cpp:1346] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:47Z INFO Starting HuggingFace LLM Trainer\n", + "[Pod fine-tune-bert-train-api-master-0]: /usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1741: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. 
Use `--hub_token` instead.\n", + "[Pod fine-tune-bert-train-api-master-0]: warnings.warn(\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:47Z INFO Setup model and tokenizer\n", + "[Pod fine-tune-bert-train-api-master-0]: Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "[Pod fine-tune-bert-train-api-master-0]: You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO Preprocess dataset\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO Load and preprocess dataset\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO Dataset specification: Dataset({\n", + "[Pod fine-tune-bert-train-api-master-0]: features: ['label', 'text'],\n", + "[Pod fine-tune-bert-train-api-master-0]: num_rows: 3000\n", + "[Pod fine-tune-bert-train-api-master-0]: })\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO ----------------------------------------\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:48Z INFO Tokenize dataset\n", + "Map: 100%|██████████| 3000/3000 [00:01<00:00, 2759.84 examples/s]\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:51Z INFO Evaluation dataset is not found\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:51Z INFO Distributed dataset across PyTorchJob workers. 
WORLD_SIZE: 1, RANK: 0\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:51Z INFO Setup LoRA config for model\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:45:51Z INFO Start model training\n", + "[Pod fine-tune-bert-train-api-master-0]: /usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py:432: FutureWarning: Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches']). Please pass an `accelerate.DataLoaderConfiguration` instead: \n", + "[Pod fine-tune-bert-train-api-master-0]: dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)\n", + "[Pod fine-tune-bert-train-api-master-0]: warnings.warn(\n", + "[Pod fine-tune-bert-train-api-master-0]: The following columns in the training set don't have a corresponding argument in `PeftModel.forward` and have been ignored: text. If text are not expected by `PeftModel.forward`, you can safely ignore this message.\n", + "[Pod fine-tune-bert-train-api-master-0]: ***** Running training *****\n", + "[Pod fine-tune-bert-train-api-master-0]: Num examples = 3,000\n", + "[Pod fine-tune-bert-train-api-master-0]: Num Epochs = 3\n", + "[Pod fine-tune-bert-train-api-master-0]: Instantaneous batch size per device = 8\n", + "[Pod fine-tune-bert-train-api-master-0]: Total train batch size (w. parallel, distributed & accumulation) = 8\n", + "[Pod fine-tune-bert-train-api-master-0]: Gradient Accumulation steps = 1\n", + "[Pod fine-tune-bert-train-api-master-0]: Total optimization steps = 1,125\n", + "[Pod fine-tune-bert-train-api-master-0]: Number of trainable parameters = 294,912\n", + "[Pod fine-tune-bert-train-api-master-0]: [W reducer.cpp:1346] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. 
This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", + "[Pod fine-tune-bert-train-api-master-0]: {'loss': 0.7481, 'learning_rate': 2.777777777777778e-05, 'epoch': 1.33}\n", + "[Pod fine-tune-bert-train-api-master-0]: {'loss': 0.9313, 'learning_rate': 5.555555555555556e-06, 'epoch': 2.67}\n", + "[Pod fine-tune-bert-train-api-master-0]: Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "[Pod fine-tune-bert-train-api-master-0]: {'train_runtime': 234.849, 'train_samples_per_second': 38.322, 'train_steps_per_second': 4.79, 'train_loss': 0.8460628526475694, 'epoch': 3.0}\n", + "[Pod fine-tune-bert-train-api-master-0]: 2024-03-15T16:49:47Z INFO Training is complete\n" + ] + } + ], + "source": [ + "logs, _ = TrainingClient().get_job_logs(job_name_train_api, follow=True)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-01T23:44:15.511173Z", + "iopub.status.busy": "2024-03-01T23:44:15.510932Z", + "iopub.status.idle": "2024-03-01T23:44:15.539921Z", + "shell.execute_reply": "2024-03-01T23:44:15.539352Z", + "shell.execute_reply.started": "2024-03-01T23:44:15.511155Z" + }, + "tags": [] + }, + "source": [ + "## Delete the PyTorchJobs\n", + "\n", + "You can delete the created PyTorchJobs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TrainingClient().delete_job(name=job_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "TrainingClient().delete_job(name=job_name_train_api)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/sdk/python/kubeflow/storage_initializer/Dockerfile b/sdk/python/kubeflow/storage_initializer/Dockerfile index 558e3553da..75bd667c87 100644 --- a/sdk/python/kubeflow/storage_initializer/Dockerfile +++ b/sdk/python/kubeflow/storage_initializer/Dockerfile @@ -4,14 +4,14 @@ FROM python:3.11 # Set the working directory in the container WORKDIR /app -# Copy the Python package and its source code into the container -COPY . /app/storage_initializer - # Copy the requirements.txt file into the container COPY requirements.txt /app/requirements.txt # Install any needed packages specified in requirements.txt RUN pip install --no-cache-dir -r requirements.txt +# Copy the Python package and its source code into the container +COPY . 
/app/storage_initializer + # Run storage.py when the container launches ENTRYPOINT ["python", "-m", "storage_initializer.storage"] diff --git a/sdk/python/kubeflow/storage_initializer/hugging_face.py b/sdk/python/kubeflow/storage_initializer/hugging_face.py index 06fb3f5b50..0d4d344aab 100644 --- a/sdk/python/kubeflow/storage_initializer/hugging_face.py +++ b/sdk/python/kubeflow/storage_initializer/hugging_face.py @@ -1,9 +1,12 @@ +import logging +import json +from typing import Union, Optional from dataclasses import dataclass, field +from urllib.parse import urlparse + import transformers from peft import LoraConfig -from urllib.parse import urlparse -import json, os -from typing import Union + from .constants import VOLUME_PATH_DATASET, VOLUME_PATH_MODEL from .abstract_model_provider import modelProvider from .abstract_dataset_provider import datasetProvider @@ -19,6 +22,17 @@ ] +# Configure logger. +log_formatter = logging.Formatter( + "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ" +) +logger = logging.getLogger(__file__) +console_handler = logging.StreamHandler() +console_handler.setFormatter(log_formatter) +logger.addHandler(console_handler) +logger.setLevel(logging.INFO) + + @dataclass class HuggingFaceModelParams: model_uri: str @@ -46,7 +60,8 @@ def load_config(self, serialised_args): def download_model_and_tokenizer(self): # implementation for downloading the model - print("downloading model") + logger.info("Downloading model") + logger.info("-" * 40) transformer_type_class = getattr(transformers, self.config.transformer_type) parsed_uri = urlparse(self.config.model_uri) self.model = parsed_uri.netloc + parsed_uri.path @@ -64,7 +79,9 @@ def download_model_and_tokenizer(self): @dataclass class HfDatasetParams: repo_id: str - access_token: str = None + access_token: Optional[str] = None + # TODO (andreyvelich): Discuss where we should specify dataset preprocess parameters. 
+ split: Optional[str] = None def __post_init__(self): # Custom checks or validations can be added here @@ -77,11 +94,14 @@ def load_config(self, serialised_args): self.config = HfDatasetParams(**json.loads(serialised_args)) def download_dataset(self): - print("downloading dataset") + logger.info("Downloading dataset") + logger.info("-" * 40) import huggingface_hub from datasets import load_dataset if self.config.access_token: huggingface_hub.login(self.config.access_token) - load_dataset(self.config.repo_id, cache_dir=VOLUME_PATH_DATASET) + # Load dataset and save to disk. + dataset = load_dataset(self.config.repo_id, split=self.config.split) + dataset.save_to_disk(VOLUME_PATH_DATASET) diff --git a/sdk/python/kubeflow/storage_initializer/requirements.txt b/sdk/python/kubeflow/storage_initializer/requirements.txt index 7edab476e8..dd896ecae7 100644 --- a/sdk/python/kubeflow/storage_initializer/requirements.txt +++ b/sdk/python/kubeflow/storage_initializer/requirements.txt @@ -1,8 +1,5 @@ -einops>=0.6.1 -transformers_stream_generator==0.0.4 -boto3==1.33.9 -transformers>=4.20.0 peft==0.3.0 -huggingface_hub==0.16.4 -datasets>=2.13.2 - +datasets==2.15.0 +transformers==4.37.2 +boto3==1.33.9 +huggingface_hub==0.19.3 diff --git a/sdk/python/kubeflow/storage_initializer/s3.py b/sdk/python/kubeflow/storage_initializer/s3.py index 89a3647b4a..506817750e 100644 --- a/sdk/python/kubeflow/storage_initializer/s3.py +++ b/sdk/python/kubeflow/storage_initializer/s3.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field -import json, os -import boto3 +import json +import os from urllib.parse import urlparse from .abstract_dataset_provider import datasetProvider from .constants import VOLUME_PATH_DATASET @@ -39,6 +39,8 @@ def load_config(self, serialised_args): self.config = S3DatasetParams(**json.loads(serialised_args)) def download_dataset(self): + import boto3 + # Create an S3 client for Nutanix Object Store/S3 s3_client = boto3.client( "s3", diff --git 
a/sdk/python/kubeflow/storage_initializer/storage.py b/sdk/python/kubeflow/storage_initializer/storage.py index 73937ad822..f65d9d324c 100644 --- a/sdk/python/kubeflow/storage_initializer/storage.py +++ b/sdk/python/kubeflow/storage_initializer/storage.py @@ -42,7 +42,7 @@ def dataset_factory(dataset_provider, dataset_provider_parameters): parser.add_argument( "--dataset_provider_parameters", type=str, - help="dataset provider serialised arguments", + help="dataset provider serialized arguments", ) args = parser.parse_args() diff --git a/sdk/python/kubeflow/trainer/Dockerfile b/sdk/python/kubeflow/trainer/Dockerfile index d82b715552..d0ebee4aa3 100644 --- a/sdk/python/kubeflow/trainer/Dockerfile +++ b/sdk/python/kubeflow/trainer/Dockerfile @@ -4,15 +4,14 @@ FROM nvcr.io/nvidia/pytorch:23.10-py3 # Set the working directory in the container WORKDIR /app -# Copy the Python package and its source code into the container -COPY . /app - # Copy the requirements.txt file into the container - COPY requirements.txt /app/requirements.txt +COPY requirements.txt /app/requirements.txt # Install any needed packages specified in requirements.txt RUN pip install --no-cache-dir -r requirements.txt +# Copy the Python package and its source code into the container +COPY . 
/app + # Run storage.py when the container launches ENTRYPOINT ["torchrun", "hf_llm_training.py"] - \ No newline at end of file diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index c39c547c83..26dd4fbe0e 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -1,45 +1,52 @@ import argparse +import logging +from urllib.parse import urlparse +import json +import os + +from datasets import load_from_disk, Dataset +from datasets.distributed import split_dataset_by_node +from peft import LoraConfig, get_peft_model import transformers from transformers import ( AutoModelForCausalLM, AutoTokenizer, - AutoConfig, + AutoModelForImageClassification, TrainingArguments, DataCollatorForLanguageModeling, Trainer, ) -import torch -from datasets import load_dataset -from peft import LoraConfig, get_peft_model -from urllib.parse import urlparse -import os -import json + + +# Configure logger. 
+log_formatter = logging.Formatter( + "%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%dT%H:%M:%SZ" +) +logger = logging.getLogger(__file__) +console_handler = logging.StreamHandler() +console_handler.setFormatter(log_formatter) +logger.addHandler(console_handler) +logger.setLevel(logging.INFO) def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): # Set up the model and tokenizer parsed_uri = urlparse(model_uri) model_name = parsed_uri.netloc + parsed_uri.path - transformer_type_class = getattr(transformers, transformer_type) - model = transformer_type_class.from_pretrained( + model = transformer_type.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=model_dir, local_files_only=True, - device_map="auto", trust_remote_code=True, ) - tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=model_dir, local_files_only=True, - device_map="auto", ) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.add_pad_token = True - # Freeze model parameters for param in model.parameters(): param.requires_grad = False @@ -47,24 +54,55 @@ def setup_model_and_tokenizer(model_uri, transformer_type, model_dir): return model, tokenizer -def load_and_preprocess_data(dataset_name, dataset_dir, transformer_type, tokenizer): +def load_and_preprocess_data(dataset_dir, transformer_type, tokenizer): # Load and preprocess the dataset - print("loading dataset") - transformer_type_class = getattr(transformers, transformer_type) - if transformer_type_class != transformers.AutoModelForImageClassification: - dataset = load_dataset(dataset_name, cache_dir=dataset_dir).map( - lambda x: tokenizer(x["text"]), batched=True + logger.info("Load and preprocess dataset") + + if transformer_type != AutoModelForImageClassification: + dataset = load_from_disk(dataset_dir) + + logger.info(f"Dataset specification: {dataset}") + logger.info("-" * 40) + + logger.info("Tokenize 
dataset") + # TODO (andreyvelich): Discuss how user should set the tokenizer function. + dataset = dataset.map( + lambda x: tokenizer(x["text"], padding="max_length", truncation=True), + batched=True, ) else: - dataset = load_dataset(dataset_name, cache_dir=dataset_dir) + dataset = load_from_disk(dataset_dir) - train_data = dataset["train"] + # Check if dataset contains `train` key. Otherwise, load full dataset to train_data. + if "train" in dataset: + train_data = dataset["train"] + else: + train_data = dataset try: eval_data = dataset["eval"] - except Exception as err: + except Exception: eval_data = None - print("Evaluation dataset is not found") + logger.info("Evaluation dataset is not found") + + # Distribute dataset across PyTorchJob workers. + RANK = int(os.environ["RANK"]) + WORLD_SIZE = int(os.environ["WORLD_SIZE"]) + logger.info( + f"Distributed dataset across PyTorchJob workers. WORLD_SIZE: {WORLD_SIZE}, RANK: {RANK}" + ) + if isinstance(train_data, Dataset): + train_data = split_dataset_by_node( + train_data, + rank=RANK, + world_size=WORLD_SIZE, + ) + if isinstance(eval_data, Dataset): + eval_data = split_dataset_by_node( + eval_data, + rank=RANK, + world_size=WORLD_SIZE, + ) return train_data, eval_data @@ -77,20 +115,27 @@ def setup_peft_model(model, lora_config): return model -def train_model(model, train_data, eval_data, tokenizer, train_args): - # Train the model +def train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args): + # Setup the Trainer. trainer = Trainer( model=model, train_dataset=train_data, eval_dataset=eval_data, - tokenizer=tokenizer, args=train_args, - data_collator=DataCollatorForLanguageModeling( - tokenizer, pad_to_multiple_of=8, mlm=False - ), ) + + # TODO (andreyvelich): Currently, data collator is supported only for causal LM Transformer.
+ if transformer_type == AutoModelForCausalLM: + logger.info("Add data collector for language modeling") + logger.info("-" * 40) + trainer.data_collator = DataCollatorForLanguageModeling( + tokenizer, + pad_to_multiple_of=8, + mlm=False, + ) + + # Train the model. trainer.train() - print("training done") def parse_arguments(): @@ -101,8 +146,7 @@ def parse_arguments(): parser.add_argument("--model_uri", help="model uri") parser.add_argument("--transformer_type", help="model transformer type") parser.add_argument("--model_dir", help="directory containing model") - parser.add_argument("--dataset_dir", help="directory contaning dataset") - parser.add_argument("--dataset_name", help="dataset name") + parser.add_argument("--dataset_dir", help="directory containing dataset") parser.add_argument("--lora_config", help="lora_config") parser.add_argument( "--training_parameters", help="hugging face training parameters" @@ -112,13 +156,25 @@ def parse_arguments(): if __name__ == "__main__": + logger.info("Starting HuggingFace LLM Trainer") args = parse_arguments() train_args = TrainingArguments(**json.loads(args.training_parameters)) + transformer_type = getattr(transformers, args.transformer_type) + + logger.info("Setup model and tokenizer") model, tokenizer = setup_model_and_tokenizer( - args.model_uri, args.transformer_type, args.model_dir + args.model_uri, transformer_type, args.model_dir ) + + logger.info("Preprocess dataset") train_data, eval_data = load_and_preprocess_data( - args.dataset_name, args.dataset_dir, args.transformer_type, tokenizer + args.dataset_dir, transformer_type, tokenizer ) + + logger.info("Setup LoRA config for model") model = setup_peft_model(model, args.lora_config) - train_model(model, train_data, eval_data, tokenizer, train_args) + + logger.info("Start model training") + train_model(model, transformer_type, train_data, eval_data, tokenizer, train_args) + + logger.info("Training is complete") diff --git 
a/sdk/python/kubeflow/trainer/requirements.txt b/sdk/python/kubeflow/trainer/requirements.txt index 17a594c75d..57d11e30b7 100644 --- a/sdk/python/kubeflow/trainer/requirements.txt +++ b/sdk/python/kubeflow/trainer/requirements.txt @@ -1,5 +1,3 @@ peft==0.3.0 datasets==2.15.0 -transformers>=4.20.0 -bitsandbytes>=0.42.0 -einops>=0.6.1 +transformers==4.37.2 diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 573dd2d162..0541f9fa4a 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -99,9 +99,10 @@ def train( namespace: Optional[str] = None, num_workers: int = 1, num_procs_per_worker: int = 1, - storage_config: Dict[str, Optional[str]] = { - "size": "10Gi", + storage_config: Dict[str, Optional[Union[str, List[str]]]] = { + "size": constants.PVC_DEFAULT_SIZE, "storage_class": None, + "access_modes": constants.PVC_DEFAULT_ACCESS_MODES, }, model_provider_parameters=None, dataset_provider_parameters=None, @@ -125,7 +126,6 @@ def train( from kubeflow.storage_initializer.s3 import S3DatasetParams from kubeflow.storage_initializer.hugging_face import ( HuggingFaceModelParams, - HuggingFaceTrainParams, HfDatasetParams, ) @@ -161,7 +161,7 @@ def train( ) break else: - raise RuntimeError("failed to create pvc") + raise RuntimeError(f"failed to create PVC. 
Error: {e}") if isinstance(model_provider_parameters, HuggingFaceModelParams): mp = "hf" @@ -211,8 +211,6 @@ def train( VOLUME_PATH_MODEL, "--dataset_dir", VOLUME_PATH_DATASET, - "--dataset_name", - dataset_name, "--lora_config", json.dumps(train_parameters.lora_config.__dict__, cls=utils.SetEncoder), "--training_parameters", @@ -225,7 +223,6 @@ def train( # create worker pod spec worker_pod_template_spec = utils.get_pod_template_spec( containers=[container_spec], - init_containers=[init_container_spec], volumes=[constants.STORAGE_INITIALIZER_VOLUME], ) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index a2c59fcbc6..0513c3e31e 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -71,6 +71,13 @@ # Constants for Train API. STORAGE_INITIALIZER = "storage-initializer" +# The default value for dataset and model storage PVC. +PVC_DEFAULT_SIZE = "10Gi" +# The default value for PVC access modes. +PVC_DEFAULT_ACCESS_MODES = ["ReadWriteOnce", "ReadOnlyMany"] + + +# TODO (andreyvelich): We should add image tag for Storage Initializer and Trainer. STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" STORAGE_INITIALIZER_VOLUME_MOUNT = models.V1VolumeMount( diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py index 5480044fa9..06f76b2164 100644 --- a/sdk/python/kubeflow/training/utils/utils.py +++ b/sdk/python/kubeflow/training/utils/utils.py @@ -284,27 +284,27 @@ def get_tfjob_template( # Add Chief, PS, and Worker replicas to the TFJob. 
if num_chief_replicas is not None: - tfjob.spec.tf_replica_specs[ - constants.REPLICA_TYPE_CHIEF - ] = models.KubeflowOrgV1ReplicaSpec( - replicas=num_chief_replicas, - template=pod_template_spec, + tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_CHIEF] = ( + models.KubeflowOrgV1ReplicaSpec( + replicas=num_chief_replicas, + template=pod_template_spec, + ) ) if num_ps_replicas is not None: - tfjob.spec.tf_replica_specs[ - constants.REPLICA_TYPE_PS - ] = models.KubeflowOrgV1ReplicaSpec( - replicas=num_ps_replicas, - template=pod_template_spec, + tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_PS] = ( + models.KubeflowOrgV1ReplicaSpec( + replicas=num_ps_replicas, + template=pod_template_spec, + ) ) if num_workers is not None: - tfjob.spec.tf_replica_specs[ - constants.REPLICA_TYPE_WORKER - ] = models.KubeflowOrgV1ReplicaSpec( - replicas=num_workers, - template=pod_template_spec, + tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_WORKER] = ( + models.KubeflowOrgV1ReplicaSpec( + replicas=num_workers, + template=pod_template_spec, + ) ) return tfjob @@ -343,19 +343,19 @@ def get_pytorchjob_template( # Create Master replica if that is set. if master_pod_template_spec: - pytorchjob.spec.pytorch_replica_specs[ - constants.REPLICA_TYPE_MASTER - ] = models.KubeflowOrgV1ReplicaSpec( - replicas=1, - template=master_pod_template_spec, + pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_MASTER] = ( + models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=master_pod_template_spec, + ) ) # If we don't define Master template, use the Worker template. else: - pytorchjob.spec.pytorch_replica_specs[ - constants.REPLICA_TYPE_MASTER - ] = models.KubeflowOrgV1ReplicaSpec( - replicas=1, - template=worker_pod_template_spec, + pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_MASTER] = ( + models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=worker_pod_template_spec, + ) ) # Create Worker with num_workers - 1 replicas. 
@@ -364,11 +364,11 @@ def get_pytorchjob_template( # doesn't set RANK and WORLD_SIZE for PyTorchJob. # Ref issue: https://github.com/kubeflow/training-operator/issues/1991 if num_workers > 1: - pytorchjob.spec.pytorch_replica_specs[ - constants.REPLICA_TYPE_WORKER - ] = models.KubeflowOrgV1ReplicaSpec( - replicas=num_workers - 1, - template=worker_pod_template_spec, + pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_WORKER] = ( + models.KubeflowOrgV1ReplicaSpec( + replicas=num_workers - 1, + template=worker_pod_template_spec, + ) ) return pytorchjob @@ -377,17 +377,23 @@ def get_pytorchjob_template( def get_pvc_spec( pvc_name: str, namespace: str, - storage_config: Dict[str, Optional[str]], + storage_config: Dict[str, Optional[Union[str, List[str]]]], ): - if pvc_name is None or namespace is None or "size" not in storage_config: - raise ValueError("One of the arguments is None") + if pvc_name is None or namespace is None: + raise ValueError("One of the required storage config argument is None") + + if "size" not in storage_config: + storage_config["size"] = constants.PVC_DEFAULT_SIZE + + if "access_modes" not in storage_config: + storage_config["access_modes"] = constants.PVC_DEFAULT_ACCESS_MODES pvc_spec = models.V1PersistentVolumeClaim( api_version="v1", kind="PersistentVolumeClaim", metadata={"name": pvc_name, "namepsace": namespace}, spec=models.V1PersistentVolumeClaimSpec( - access_modes=["ReadWriteOnce", "ReadOnlyMany"], + access_modes=storage_config["access_modes"], resources=models.V1ResourceRequirements( requests={"storage": storage_config["size"]} ), diff --git a/sdk/python/setup.py b/sdk/python/setup.py index 2e29c22f2d..536a81483c 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -64,6 +64,6 @@ tests_require=TESTS_REQUIRES, extras_require={ "test": TESTS_REQUIRES, - "huggingface": ["transformers>=4.20.0", "peft==0.3.0"], + "huggingface": ["transformers==4.37.2", "peft==0.3.0"], }, )