From 9fd91cf6348ff5a1cb105b84051d738d893b70d4 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Wed, 13 Mar 2024 21:53:37 -0700 Subject: [PATCH] Validation (#1027) * add validation script * update * change token count function * reorganize cells * Add unit tests * Add a printout for CPT * update question * Add questions * Fix lints * update format * update * nb source * add validation script * update * change token count function * reorganize cells * Add unit tests * Add a printout for CPT * update question * Add questions * Fix lints * update format * update * nb source * Remove license insert for validation notebook * Add validation utils * Minor cleanups (#858) * nits * logger * add log * lint * update utils/__init__.py to include extra validation functions * update notebook * update * update * Read UC delta table (#773) * initial commit * use databricks-sql to read delta table and convert to json * update * update * update * add mocked unittest * Fix lints * update * update * restructure code * Add timer for optimizing * Add db-connect * add wrapper * update * add install dbconnect * update * update * patch dbconnect to allow multiple return formats * update * add arrow * use compression * clean up * Add cluster rt check * Fix lints * remove patch.py for CI * update * update * updat * update * fix tests * fix lint * update * update * Add more tests * update * update * update * change to download_json * update * fix lints * Add decompressed option for arrow * format json to jsonl * Add comments * Make cf_collect_type global option * fix comments * fix lints * fix comments * Fix lints * change to use workspaceclient * Add CPT support * Rewire method assignment logic * Fix bug in stripping https * Add tests for rewired method assignment logic * Fix lints * Fix lints * Removed logger set_level * Remove pyspark. It conflicts with databricks-connect * Update the comment * skip cluster version check when cluster_id is serverless * Add use_serverless flag * update tests with use_serverless flag * Fix lints --------- Co-authored-by: Xiaohan Zhang * Add download remote function to util * update * remove fused layernorm (#859) * update * update * update * update * update * update * update * update * update * Remove hardcoded combined.jsonl with a flag (#861) * Remove hardcoded combined.jsonl with a flag * update * change output_json_path output_json_folder --------- Co-authored-by: Xiaohan Zhang * bump (#828) * Add dask and dataframe_to_mds * update * update * update * update * Add notebook * update * update * remove script and tests, keep notebook * update * update * update * update * Always initialize dist (#864) * fix dev * lint * remove gpu * updated notebook * remove scripts keep notebook * update notebook. rephrase. * update * Add response tokens * update * update * Disable MDSWrite, return token counts * Change plot settings * update notebook * update * update notebook * update * update notebook * update pip install link * Change done file location * Create the dest folder * update notebook * update --------- Co-authored-by: Xiaohan Zhang Co-authored-by: xiaohanzhan-db Co-authored-by: Mihir Patel --- notebooks/validate_and_tokenize_data.ipynb | 184 ++++++++++++++++++--- 1 file changed, 163 insertions(+), 21 deletions(-) diff --git a/notebooks/validate_and_tokenize_data.ipynb b/notebooks/validate_and_tokenize_data.ipynb index b7048a55d7..63d40a2a04 100644 --- a/notebooks/validate_and_tokenize_data.ipynb +++ b/notebooks/validate_and_tokenize_data.ipynb @@ -4,7 +4,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "f275a21b-47d4-472c-972b-e2a84a597db2", "showTitle": false, @@ -54,7 +57,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "3d08a21c-9f5a-4ad2-af85-e016335cc53d", "showTitle": false, @@ -173,6 +179,7 @@ "import re\n", "import json\n", "import tempfile\n", + "import random\n", "import numpy as np\n", "import pandas as pd \n", "from collections import defaultdict\n", @@ -193,7 +200,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "3a513cdd-967d-4a87-b56f-340053fa79cd", "showTitle": false, @@ -208,7 +218,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "cfebdfdf-b87c-4a77-b97c-4697566a55fa", "showTitle": false, @@ -255,6 +268,29 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0d1f2e9e-db40-41fd-a6b9-bb4757db08b0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Make sure you have write access to the ``home`` directory\n", + "home = os.path.join('/local_disk0', 'ift')\n", + "os.makedirs(home, exist_ok=True)\n", + "os.chdir(home)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -271,22 +307,26 @@ "source": [ "FT_API_args = Namespace(\n", " model= 'mosaicml/mpt-7b', # Other examples: 'EleutherAI/gpt-neox-20b',\n", - " train_data_path= 'main.streaming.random_large_table', # Other examples: 'tatsu-lab/alpaca/train', # '/Volumes/main/mosaic_hackathon/managed-volume/IFT/train.jsonl' # 'mosaicml/dolly_hhrlhf/train'\n", + " train_data_path= 'mosaicml/dolly_hhrlhf/train', # Other examples: '/path/to/train.jsonl', 'catalog.schema.table'\n", " task_type='INSTRUCTION_FINETUNE',\n", " training_duration=3,\n", " context_length=2048,\n", ")\n", "\n", - "temporary_jsonl_data_path = '/Volumes/main/mosaic_hackathon/managed-volume/IFT/ft_data_11Jan24_3/train'\n", - "os.environ['HF_DATASETS_CACHE'] = '/tmp/'\n", - "os.makedirs(temporary_jsonl_data_path, exist_ok=True)" + "temporary_jsonl_data_path = os.path.join(home, 'ft_data_11Jan24_3/train')\n", + "os.environ['HF_DATASETS_CACHE'] = os.path.join(home, 'hf_cache')\n", + "os.makedirs(temporary_jsonl_data_path, exist_ok=True)\n", + "os.makedirs(os.environ['HF_DATASETS_CACHE'], exist_ok=True)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "39c45005-1a77-4162-b9e4-bd8df6f5ec69", "showTitle": false, @@ -362,7 +402,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "06d46367-bd32-473a-9f16-1b34a8dd9356", "showTitle": false, @@ -377,7 +420,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "1a28320a-a2a1-4f3c-a0cd-ad6045a24f64", "showTitle": false, @@ -467,7 +513,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "9713a0ce-80f4-4187-b10b-4223b17fe4c1", "showTitle": false, @@ -506,7 +555,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "7249e9e6-1ea7-4fc9-8959-8a17d62a9fb4", "showTitle": false, @@ -547,7 +599,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "6699f47f-9b53-47da-95c0-b862c5826d0a", "showTitle": false, @@ -562,7 +617,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "dd37fdce-62d0-493e-bfa9-d823634b2a0d", "showTitle": false, @@ -624,12 +682,66 @@ "source": [ "FT_API_args = Namespace(\n", " model= 'mosaicml/mpt-7b',\n", - " train_data_path= '/Volumes/main/mosaic_hackathon/managed-volume/ABT',\n", + " train_data_path= os.path.join(home, 'ABT'), # this is the path to your collection of txt files\n", " task_type='CONTINUED_PRETRAIN',\n", " training_duration=3,\n", - " context_length=2048,\n", + " context_length=8,\n", ")\n", - "temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/{your_username}/mds_data_11Jan24_5'" + "temporary_mds_output_path = os.path.join(home, 'mds_data_11Jan24_5')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fc2e4e8b-7700-47c4-bb21-ae4c389f39a2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Generate a synthetic dataset. Replace train_data_path with your raw data path in practice." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "10f08422-5091-4e64-b3f7-54928584cd60", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def generate_synthetic_dataset(folder_path, num_files=128):\n", + " \"\"\"Generate a synthetic dataset of text files with random words.\"\"\"\n", + " def generate_random_words(num_words=50):\n", + " words = [\"apple\", \"banana\", \"cherry\", \"date\", \"elderberry\", \"fig\", \"grape\", \"honeydew\", \"kiwi\", \"lemon\", \"mango\", \"nectarine\", \"orange\", \"papaya\", \"quince\", \"raspberry\", \"strawberry\", \"tangerine\", \"ugli\", \"vanilla\", \"watermelon\", \"xigua\", \"yam\", \"zucchini\"]\n", + " return ' '.join(random.choice(words) for _ in range(num_words))\n", + "\n", + " if not os.path.exists(folder_path):\n", + " os.makedirs(folder_path)\n", + " \n", + " for i in range(num_files):\n", + " file_path = os.path.join(folder_path, f\"file_{i}.txt\")\n", + " with open(file_path, 'w') as file:\n", + " file.write(generate_random_words())\n", + "\n", + " print(f\"Generated {num_files} files in '{folder_path}'.\")\n", + "\n", + "generate_synthetic_dataset(FT_API_args.train_data_path)" ] }, { @@ -656,7 +768,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "c21e7d1b-db34-4e5d-b6d9-190dc75170d3", "showTitle": false, @@ -688,6 +803,27 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f5aea2a8-db29-40c9-8ed2-b6a1d032e7ab", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import os\n", + "os.makedirs(temporary_mds_output_path, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -734,7 +870,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "298eb990-9160-4e1b-958f-33dd2c11b54b", "showTitle": false, @@ -776,7 +915,10 @@ "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "e123669c-2f77-4d66-93eb-04efd546f39f", "showTitle": false,