From a7c884fc17da873e9e822f45edf5754742f37253 Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Fri, 20 Oct 2023 00:14:21 +0000 Subject: [PATCH 01/11] added text summarization notebook --- .../Text-Summerization-gptipynb.ipynb | 1974 +++++++++++++++++ 1 file changed, 1974 insertions(+) create mode 100644 docs/notebooks/Text-Summerization-gptipynb.ipynb diff --git a/docs/notebooks/Text-Summerization-gptipynb.ipynb b/docs/notebooks/Text-Summerization-gptipynb.ipynb new file mode 100644 index 00000000..a8146f28 --- /dev/null +++ b/docs/notebooks/Text-Summerization-gptipynb.ipynb @@ -0,0 +1,1974 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "4vXfYHX6QSJu" + }, + "source": [ + "# Generate Synthetic text summarization with Gretel GPT\n", + "\n", + "* In this notebook we use Gretel GPT with Llama-2 7b model to create synthetic text summerization dataset. \n", + "* To run this notebook, you will need an API key from the [Gretel Console](https://console.gretel.ai/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GhwZL2atTilv" + }, + "source": [ + "## Getting Started" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "V_iIkqnUQK2l" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -U gretel-client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kixD67x_TSC4" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from gretel_client import configure_session\n", + "from gretel_client.helpers import poll\n", + "from gretel_client.projects import create_or_get_unique_project, get_project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mlQIp_uGTTgo", + "outputId": "e27df360-d8b3-46ea-df4a-4244f6c3373a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Caching Gretel config to disk.\n", + 
"Using endpoint https://api-dev.gretel.cloud\n", + "Logged in as marjan@gretel.ai ✅\n" + ] + } + ], + "source": [ + "# Log into Gretel\n", + "configure_session(api_key=\"prompt\", cache=\"yes\", endpoint=\"https://api-dev.gretel.cloud\", validate=True, clear=True)\n", + "\n", + "pd.set_option('max_colwidth', None)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0pMwi0RghUzh" + }, + "source": [ + "## Load and preview training data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 234 + }, + "id": "_QyG3jfRh2-i", + "outputId": "a60c1c7e-b71e-4843-cfb4-784000730546" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dialogue_and_summary
0**dialogue**\\nLeon: I asked you to lend me your camera\\r\\nItzel: You can come and take it from my home\\r\\nLeon: Would be there in an hour\\n\\n**summary**\\nLeon will borrow Itzel's camera.
1**dialogue**\\nJayleen: I'm dyeing my hair\\r\\nAugust: What colour?\\r\\nJayleen: I'm staying with my blonde. I had to refresh my colour\\r\\nAugust: Ok\\r\\nJayleen: I haven't dyed my hair for around 9 months xd\\n\\n**summary**\\nJayleen is dyeing her hair blonde for the first time in 9 months.
2**dialogue**\\nLiam: I don't think the institutional approach is too interesting\\nJeff: I agree...\\nTom: so let's try to find an alternative\\n\\n**summary**\\nLiam and Jeff do not find institutional approach interesting.
3**dialogue**\\nGina: <file_photo>\\r\\nGina: What do you think?\\r\\nKate: Grab it! At that price it is an absolute bargain.\\n\\n**summary**\\nKate wants Gina to buy it because it's cheap.
4**dialogue**\\nAndrew: Hello Janny, is it still convenient for us to come and check your gas meter at 2.45 today?\\r\\nJanny: Hi Andrew, that's fine. \\r\\nAndrew: Thank you, we will see you then\\n\\n**summary**\\nAndrew will come to Janny to check her gas meter at 2.45 today.
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "text/plain": [ + " dialogue_and_summary\n", + "0 **dialogue**\\nLeon: I asked you to lend me your camera\\r\\nItzel: You can come and take it from my home\\r\\nLeon: Would be there in an hour\\n\\n**summary**\\nLeon will borrow Itzel's camera. \n", + "1 **dialogue**\\nJayleen: I'm dyeing my hair\\r\\nAugust: What colour?\\r\\nJayleen: I'm staying with my blonde. I had to refresh my colour\\r\\nAugust: Ok\\r\\nJayleen: I haven't dyed my hair for around 9 months xd\\n\\n**summary**\\nJayleen is dyeing her hair blonde for the first time in 9 months.\n", + "2 **dialogue**\\nLiam: I don't think the institutional approach is too interesting\\nJeff: I agree...\\nTom: so let's try to find an alternative\\n\\n**summary**\\nLiam and Jeff do not find institutional approach interesting.\n", + "3 **dialogue**\\nGina: \\r\\nGina: What do you think?\\r\\nKate: Grab it! At that price it is an absolute bargain.\\n\\n**summary**\\nKate wants Gina to buy it because it's cheap. \n", + "4 **dialogue**\\nAndrew: Hello Janny, is it still convenient for us to come and check your gas meter at 2.45 today?\\r\\nJanny: Hi Andrew, that's fine. \\r\\nAndrew: Thank you, we will see you then\\n\\n**summary**\\nAndrew will come to Janny to check her gas meter at 2.45 today." + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Specify a dataset to train on\n", + "DATASET_PATH = 'https://gretel-datasets.s3.us-west-2.amazonaws.com/Text-dataset/Samsum-text-summerization-sample-1000.csv'\n", + "df = pd.read_csv(DATASET_PATH)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4PD5B0U06ALs" + }, + "source": [ + "## Configure and Train the Synthetic Model:\n", + "\n", + "We can experiment different \"steps\" parameters which result in a change of text SQS." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HpjRvCmjU5qG", + "outputId": "2bd9311d-d749-4431-a679-fd2a8cc69e27" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO: Starting poller\n", + "INFO: Status is created. Model creation has been queued.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"uid\": \"6531b8e8f3bf601ba821bc39\",\n", + " \"guid\": \"model_2X0E5sAljm7RZuBoGNYOhziDYM3\",\n", + " \"model_name\": \"data-summarization-llama-2-7b\",\n", + " \"runner_mode\": \"cloud\",\n", + " \"user_id\": \"621e70bf492fbf0535537ea1\",\n", + " \"user_guid\": \"user_25nTzH09cLJdsemHxZVO2SdOc8u\",\n", + " \"billing_domain\": \"gretel.ai\",\n", + " \"billing_domain_guid\": \"domain_28bzIokk1eQdWUYsovba0VN1gtY\",\n", + " \"project_id\": \"6531b8e7478d822b693c19f6\",\n", + " \"project_guid\": \"proj_2X0E5l1tcgBRbcLBtBwtHNYVeYH\",\n", + " \"status_history\": {\n", + " \"created\": \"2023-10-19T23:16:56.300401Z\"\n", + " },\n", + " \"last_modified\": \"2023-10-19T23:16:56.408869Z\",\n", + " \"status\": \"created\",\n", + " \"last_active_hb\": null,\n", + " \"duration_minutes\": null,\n", + " \"error_msg\": null,\n", + " \"error_id\": null,\n", + " \"traceback\": null,\n", + " \"annotations\": null,\n", + " \"provenance\": null,\n", + " \"container_image\": \"074762682575.dkr.ecr.us-east-2.amazonaws.com/models/gpt_x@sha256:28ab363cd8f7687570a8c8470d3e2c4391b5b31ee64f3ef05970d3e8943c2d6e\",\n", + " \"container_image_version\": \"6eb73a3b\",\n", + " \"model_type\": \"gpt_x\",\n", + " \"model_type_alias\": null,\n", + " \"project_name\": \"data-summarization\",\n", + " \"config\": {\n", + " \"schema_version\": \"1.0\",\n", + " \"name\": \"data-summarization-llama-2-7b\",\n", + " \"models\": [\n", + " {\n", + " \"gpt_x\": {\n", + " \"data_source\": [\n", + " 
\"gretel_450b7de87b3145d4a365c75ec5848065_dataframe-8e6ff378-1c8c-4654-917f-9a74ba805d8e.csv\"\n", + " ],\n", + " \"ref_data\": {},\n", + " \"pretrained_model\": \"gretelai/mpt-7b\",\n", + " \"column_name\": null,\n", + " \"validation\": null,\n", + " \"params\": {\n", + " \"batch_size\": 4,\n", + " \"epochs\": null,\n", + " \"steps\": 750,\n", + " \"weight_decay\": 0.01,\n", + " \"warmup_steps\": 100,\n", + " \"lr_scheduler\": \"linear\",\n", + " \"learning_rate\": 0.0002,\n", + " \"max_tokens\": 512\n", + " },\n", + " \"generate\": {\n", + " \"num_records\": 80,\n", + " \"seed_records_multiplier\": 1,\n", + " \"maximum_text_length\": 100,\n", + " \"top_p\": 0.8987601335810778,\n", + " \"top_k\": 43,\n", + " \"num_beams\": 1,\n", + " \"do_sample\": true,\n", + " \"do_early_stopping\": true,\n", + " \"typical_p\": 0.8,\n", + " \"temperature\": null\n", + " }\n", + " }\n", + " }\n", + " ],\n", + " \"notifications\": null,\n", + " \"label_predictors\": null\n", + " },\n", + " \"autouse_config\": null,\n", + " \"autouse_handler_id\": null,\n", + " \"auth_source\": \"grtu\"\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.\n", + "INFO: Status is active. 
A worker has started creating your model!\n", + "2023-10-19T23:17:19.117521Z Resolved revision for model\n", + "{\n", + " \"revision\": \"4874b16751ab2db177bbb898bd4a1e44c89e2f25\",\n", + " \"model\": \"gretelai/mpt-7b\"\n", + "}\n", + "2023-10-19T23:17:19.118032Z Parameter efficient fine tuning (PEFT) methods will be used, which greatly reduce the number of trainable parameters.\n", + "2023-10-19T23:17:19.119751Z Starting GPT model training...\n", + "{\n", + " \"num_train_steps\": 750\n", + "}\n", + "2023-10-19T23:17:19.120088Z Fine-tuning 'gretelai/mpt-7b' with provided dataset!\n", + "2023-10-19T23:17:19.120323Z Disclaimer: the chosen model may produce untrue and/or offensive content without warning. For more info, see https://docs.gretel.ai/reference/synthetics/models/gretel-gpt#limitations-and-biases\n", + "2023-10-19T23:17:19.120688Z Downloading model from remote source. Depending on the size of the model, this may take a few minutes.\n", + "2023-10-19T23:18:19.121248Z Model download 72% complete, ETA 23s (9566455971/13300877750 bytes downloaded)\n", + "2023-10-19T23:18:47.602336Z Model download 100% complete (13300877750 bytes downloaded). 
Loading model onto GPU ...\n", + "2023-10-19T23:19:19.122637Z Still loading model ...\n", + "2023-10-19T23:20:19.127702Z Still loading model ...\n", + "2023-10-19T23:21:19.128373Z Still loading model ...\n", + "2023-10-19T23:21:33.409504Z Successfully loaded model and tokenizer.\n", + "2023-10-19T23:21:33.411805Z PEFT trainable params: 4194304 || all params: 6653480960 || trainable%: 0.0630392425441013\n", + "2023-10-19T23:22:37.786077Z Training in progress, 12.8% complete (step 96/750, ETA 433s)\n", + "{\n", + " \"loss\": 2.9354,\n", + " \"learning_rate\": 2.4e-05,\n", + " \"epoch\": 0.38,\n", + " \"step\": 96\n", + "}\n", + "2023-10-19T23:23:38.862382Z Training in progress, 25.6% complete (step 192/750, ETA 362s)\n", + "{\n", + " \"loss\": 2.7882,\n", + " \"learning_rate\": 4.8e-05,\n", + " \"epoch\": 0.77,\n", + " \"step\": 192\n", + "}\n", + "2023-10-19T23:24:15.833618Z Training in progress, 33.1% complete (step 248/750, ETA 327s)\n", + "{\n", + " \"loss\": 2.5431,\n", + " \"learning_rate\": 6.2e-05,\n", + " \"epoch\": 0.99,\n", + " \"step\": 248\n", + "}\n", + "2023-10-19T23:25:20.519866Z Training in progress, 46.9% complete (step 352/750, ETA 256s)\n", + "{\n", + " \"loss\": 2.0183,\n", + " \"learning_rate\": 8.800000000000001e-05,\n", + " \"epoch\": 1.41,\n", + " \"step\": 352\n", + "}\n", + "2023-10-19T23:26:26.105701Z Training in progress, 60.8% complete (step 456/750, ETA 188s)\n", + "{\n", + " \"loss\": 1.7245,\n", + " \"learning_rate\": 0.00011399999999999999,\n", + " \"epoch\": 1.82,\n", + " \"step\": 456\n", + "}\n", + "2023-10-19T23:26:53.968722Z Training in progress, 66.1% complete (step 496/750, ETA 164s)\n", + "{\n", + " \"loss\": 1.8687,\n", + " \"learning_rate\": 0.000124,\n", + " \"epoch\": 1.98,\n", + " \"step\": 496\n", + "}\n", + "2023-10-19T23:27:56.653214Z Training in progress, 80.0% complete (step 600/750, ETA 96s)\n", + "{\n", + " \"loss\": 1.5899,\n", + " \"learning_rate\": 0.00015000000000000001,\n", + " \"epoch\": 2.4,\n", + " 
\"step\": 600\n", + "}\n", + "2023-10-19T23:29:02.517859Z Training in progress, 93.9% complete (step 704/750, ETA 29s)\n", + "{\n", + " \"loss\": 1.6418,\n", + " \"learning_rate\": 0.00017600000000000002,\n", + " \"epoch\": 2.82,\n", + " \"step\": 704\n", + "}\n", + "2023-10-19T23:29:32.185674Z Training in progress, 99.2% complete (step 744/750, ETA 4s)\n", + "{\n", + " \"loss\": 1.8231,\n", + " \"learning_rate\": 0.00018600000000000002,\n", + " \"epoch\": 2.98,\n", + " \"step\": 744\n", + "}\n", + "2023-10-19T23:29:33.409858Z Training in progress, 100.0% complete (step 750/750, ETA 0s)\n", + "{\n", + " \"loss\": 0.4237,\n", + " \"learning_rate\": 0.000188,\n", + " \"epoch\": 3.01,\n", + " \"step\": 750\n", + "}\n", + "2023-10-19T23:29:33.410370Z Training in progress, 100.0% complete (step 750/750, ETA 0s)\n", + "{\n", + " \"train_runtime\": 479.1431,\n", + " \"train_samples_per_second\": 6.278,\n", + " \"train_steps_per_second\": 0.196,\n", + " \"train_loss\": 2.085493191759637,\n", + " \"epoch\": 3.01,\n", + " \"step\": 750\n", + "}\n", + "2023-10-19T23:29:33.411001Z Training is completed!\n", + "2023-10-19T23:29:33.411405Z GPT model training complete.\n", + "2023-10-19T23:29:33.411854Z Saving model\n", + "2023-10-19T23:29:33.465436Z Sampling 80 records using auto prompting.\n", + "2023-10-19T23:29:33.467592Z Using device 'cuda'\n", + "2023-10-19T23:29:33.781277Z Generating records...\n", + "{\n", + " \"num_records\": 80\n", + "}\n", + "2023-10-19T23:30:48.033720Z [54/80] records complete.\n", + "{\n", + " \"current_valid_count\": 54,\n", + " \"total\": 80\n", + "}\n", + "2023-10-19T23:31:21.134597Z Successfully generated 80 records\n", + "2023-10-19T23:31:21.147797Z Creating Synthetic Text Data Quality Report...\n", + "2023-10-19T23:31:21.148132Z Creating text metrics report...\n", + "2023-10-19T23:31:32.951627Z Finished creating text metrics report.\n", + "2023-10-19T23:31:32.978093Z Synthetic Text Data Quality Report finished, exporting report artifacts...\n", 
+ "2023-10-19T23:31:32.979116Z Model has been created successfully\n", + "2023-10-19T23:31:35.162769Z Uploading artifacts to Gretel Cloud...\n", + "2023-10-19T23:31:38.343704Z Upload to Gretel Cloud is completed.\n" + ] + } + ], + "source": [ + "from gretel_client.projects.models import read_model_config\n", + "\n", + "\n", + "\n", + "config = read_model_config(\"synthetics/natural-language\")\n", + "config[\"models\"][0][\"gpt_x\"][\"params\"][\"steps\"] = 600  # training steps live under params; try different values to see how text SQS changes.\n", + "\n", + "# Designate project\n", + "PROJECT = 'data-summarization'\n", + "project = create_or_get_unique_project(name=PROJECT)\n", + "\n", + "# Create and submit model\n", + "model = project.create_model_obj(model_config=config, data_source=df)\n", + "model.name = f\"{PROJECT}-llama-2-7b\"\n", + "model.submit_cloud()\n", + "\n", + "poll(model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lh4-8dddoTWb" + }, + "source": [ + "## Generate Text Synthetic Quality Score:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UsOXO4YPoSA1", + "outputId": "1c98264a-02ab-434e-b3f0-a57650b3935e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'summary': [{'field': 'synthetic_data_quality_score', 'value': 81},\n", + " {'field': 'semantic_similarity', 'value': 91},\n", + " {'field': 'structure_similarity', 'value': 55}]}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.get_report_summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "DCSzW8N-Tf9S", + "outputId": "1b0a1431-1d91-481f-cbd7-6d9c5ccd90e2" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + " Gretel Synthetic Text Data Quality Report\n", + " \n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
Synthetic Text Data Quality Report
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
Model
\n", + "
GptX Model
\n", + "
\n", + " Model UID \n", + " 6531b8e8f3bf601ba821bc39\n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + " Project\n", + " data-summarization\n", + "
\n", + "
\n", + " Generated\n", + " 10/19/2023, 23:17\n", + "
\n", + "
\n", + "
\n", + " \n", + " Excellent\n", + " \n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " Synthetic Text Data Quality Score\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\t \n", + " \n", + "\n", + "
\n", + "
\n", + " \n", + " The Synthetic Text Data Quality Score is computed by taking a weighted\n", + " combination of the individual quality metrics: Text Semantic Similarity\n", + " and Text Structure Similarity. The report supports 50+ languages, including:\n", + " English, French, German, Dutch, Italian, Portuguese, Spanish, Russian,\n", + " Polish, Arabic, Turkish, Chinese, Japanese, Thai and Korean.\n", + " \n", + " Learn more.\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "

Data Summary Statistics

\n", + "\t
\n", + "\t\t
\n", + "\t\t
\n", + "\t\t \n", + "\t\t\t \n", + "\t\t\t Excellent\n", + "\t\t\t \n", + "\t\t \n", + "\t\t\t \n", + "\t\t\t
\n", + "\t\t\t
\n", + "
\n", + "\t\t\t
\n", + "\t\t\tText Semantic Similarity\n", + "\t\t
\n", + "\t\t
\n", + "\t\t
\n", + "\t\t \n", + "\t\t\t \n", + "\t\t\t Moderate\n", + "\t\t\t \n", + "\t\t \n", + "\t\t\t \n", + "\t\t\t
\n", + "\t\t\t
\n", + "
\n", + "\t\t\t
\n", + "\t\t\tText Structure Similarity\n", + "\t\t
\n", + "\t
\n", + "\n", + "\t
\n", + "\t\t\n", + "\t\t\n", + "\t
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Training DataSynthetic Data
Row Count8080
Column Count11
Training Lines Duplicated-0
Missing Values00
Unique Values8080
Average Words Per Sentence4.484.14
Average Characters Per Word4.173.91
Average Sentence Count8.9010.11
\n", + "

\n", + " What do these values mean?\n", + "

\n", + " \n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "

Semantic Similarity Principal Component Analysis 

\n", + " \n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "

Text Structure Similarity 

\n", + " \n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + "

\n", + " Copyright © 2023 Gretel Labs, Inc. All rights reserved.\n", + "

\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": { + "text/html": { + "isolated": true + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "#Plot the text SQS report:\n", + "import IPython\n", + "from smart_open import open\n", + "\n", + "IPython.display.HTML(data=open(model.get_artifact_link(\"text_metrics_report\")).read(), metadata=dict(isolated=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J_yIE4WrW1Je" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 0596495111127c32b4037749fe15d29ec08f7d59 Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Fri, 20 Oct 2023 00:33:32 +0000 Subject: [PATCH 02/11] changed end point to prod --- .../Text-Summerization-gptipynb.ipynb | 1778 +---------------- 1 file changed, 6 insertions(+), 1772 deletions(-) diff --git a/docs/notebooks/Text-Summerization-gptipynb.ipynb b/docs/notebooks/Text-Summerization-gptipynb.ipynb index a8146f28..0c4cbea0 100644 --- a/docs/notebooks/Text-Summerization-gptipynb.ipynb +++ b/docs/notebooks/Text-Summerization-gptipynb.ipynb @@ -58,20 +58,10 @@ "id": "mlQIp_uGTTgo", "outputId": "e27df360-d8b3-46ea-df4a-4244f6c3373a" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Caching Gretel config to disk.\n", - "Using endpoint https://api-dev.gretel.cloud\n", - "Logged in as marjan@gretel.ai ✅\n" - ] - } - ], + "outputs": [], "source": [ "# Log into Gretel\n", - "configure_session(api_key=\"prompt\", cache=\"yes\", endpoint=\"https://api-dev.gretel.cloud\", validate=True, clear=True)\n", + "configure_session(api_key=\"prompt\", cache=\"yes\", endpoint=\"https://api.gretel.cloud\", validate=True, clear=True)\n", "\n", 
"pd.set_option('max_colwidth', None)" ] @@ -96,280 +86,7 @@ "id": "_QyG3jfRh2-i", "outputId": "a60c1c7e-b71e-4843-cfb4-784000730546" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dialogue_and_summary
0**dialogue**\\nLeon: I asked you to lend me your camera\\r\\nItzel: You can come and take it from my home\\r\\nLeon: Would be there in an hour\\n\\n**summary**\\nLeon will borrow Itzel's camera.
1**dialogue**\\nJayleen: I'm dyeing my hair\\r\\nAugust: What colour?\\r\\nJayleen: I'm staying with my blonde. I had to refresh my colour\\r\\nAugust: Ok\\r\\nJayleen: I haven't dyed my hair for around 9 months xd\\n\\n**summary**\\nJayleen is dyeing her hair blonde for the first time in 9 months.
2**dialogue**\\nLiam: I don't think the institutional approach is too interesting\\nJeff: I agree...\\nTom: so let's try to find an alternative\\n\\n**summary**\\nLiam and Jeff do not find institutional approach interesting.
3**dialogue**\\nGina: <file_photo>\\r\\nGina: What do you think?\\r\\nKate: Grab it! At that price it is an absolute bargain.\\n\\n**summary**\\nKate wants Gina to buy it because it's cheap.
4**dialogue**\\nAndrew: Hello Janny, is it still convenient for us to come and check your gas meter at 2.45 today?\\r\\nJanny: Hi Andrew, that's fine. \\r\\nAndrew: Thank you, we will see you then\\n\\n**summary**\\nAndrew will come to Janny to check her gas meter at 2.45 today.
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ], - "text/plain": [ - " dialogue_and_summary\n", - "0 **dialogue**\\nLeon: I asked you to lend me your camera\\r\\nItzel: You can come and take it from my home\\r\\nLeon: Would be there in an hour\\n\\n**summary**\\nLeon will borrow Itzel's camera. \n", - "1 **dialogue**\\nJayleen: I'm dyeing my hair\\r\\nAugust: What colour?\\r\\nJayleen: I'm staying with my blonde. I had to refresh my colour\\r\\nAugust: Ok\\r\\nJayleen: I haven't dyed my hair for around 9 months xd\\n\\n**summary**\\nJayleen is dyeing her hair blonde for the first time in 9 months.\n", - "2 **dialogue**\\nLiam: I don't think the institutional approach is too interesting\\nJeff: I agree...\\nTom: so let's try to find an alternative\\n\\n**summary**\\nLiam and Jeff do not find institutional approach interesting.\n", - "3 **dialogue**\\nGina: \\r\\nGina: What do you think?\\r\\nKate: Grab it! At that price it is an absolute bargain.\\n\\n**summary**\\nKate wants Gina to buy it because it's cheap. \n", - "4 **dialogue**\\nAndrew: Hello Janny, is it still convenient for us to come and check your gas meter at 2.45 today?\\r\\nJanny: Hi Andrew, that's fine. \\r\\nAndrew: Thank you, we will see you then\\n\\n**summary**\\nAndrew will come to Janny to check her gas meter at 2.45 today." - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Specify a dataset to train on\n", "DATASET_PATH = 'https://gretel-datasets.s3.us-west-2.amazonaws.com/Text-dataset/Samsum-text-summerization-sample-1000.csv'\n", @@ -399,224 +116,7 @@ "id": "HpjRvCmjU5qG", "outputId": "2bd9311d-d749-4431-a679-fd2a8cc69e27" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO: Starting poller\n", - "INFO: Status is created. 
Model creation has been queued.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"uid\": \"6531b8e8f3bf601ba821bc39\",\n", - " \"guid\": \"model_2X0E5sAljm7RZuBoGNYOhziDYM3\",\n", - " \"model_name\": \"data-summarization-llama-2-7b\",\n", - " \"runner_mode\": \"cloud\",\n", - " \"user_id\": \"621e70bf492fbf0535537ea1\",\n", - " \"user_guid\": \"user_25nTzH09cLJdsemHxZVO2SdOc8u\",\n", - " \"billing_domain\": \"gretel.ai\",\n", - " \"billing_domain_guid\": \"domain_28bzIokk1eQdWUYsovba0VN1gtY\",\n", - " \"project_id\": \"6531b8e7478d822b693c19f6\",\n", - " \"project_guid\": \"proj_2X0E5l1tcgBRbcLBtBwtHNYVeYH\",\n", - " \"status_history\": {\n", - " \"created\": \"2023-10-19T23:16:56.300401Z\"\n", - " },\n", - " \"last_modified\": \"2023-10-19T23:16:56.408869Z\",\n", - " \"status\": \"created\",\n", - " \"last_active_hb\": null,\n", - " \"duration_minutes\": null,\n", - " \"error_msg\": null,\n", - " \"error_id\": null,\n", - " \"traceback\": null,\n", - " \"annotations\": null,\n", - " \"provenance\": null,\n", - " \"container_image\": \"074762682575.dkr.ecr.us-east-2.amazonaws.com/models/gpt_x@sha256:28ab363cd8f7687570a8c8470d3e2c4391b5b31ee64f3ef05970d3e8943c2d6e\",\n", - " \"container_image_version\": \"6eb73a3b\",\n", - " \"model_type\": \"gpt_x\",\n", - " \"model_type_alias\": null,\n", - " \"project_name\": \"data-summarization\",\n", - " \"config\": {\n", - " \"schema_version\": \"1.0\",\n", - " \"name\": \"data-summarization-llama-2-7b\",\n", - " \"models\": [\n", - " {\n", - " \"gpt_x\": {\n", - " \"data_source\": [\n", - " \"gretel_450b7de87b3145d4a365c75ec5848065_dataframe-8e6ff378-1c8c-4654-917f-9a74ba805d8e.csv\"\n", - " ],\n", - " \"ref_data\": {},\n", - " \"pretrained_model\": \"gretelai/mpt-7b\",\n", - " \"column_name\": null,\n", - " \"validation\": null,\n", - " \"params\": {\n", - " \"batch_size\": 4,\n", - " \"epochs\": null,\n", - " \"steps\": 750,\n", - " \"weight_decay\": 0.01,\n", - " \"warmup_steps\": 
100,\n", - " \"lr_scheduler\": \"linear\",\n", - " \"learning_rate\": 0.0002,\n", - " \"max_tokens\": 512\n", - " },\n", - " \"generate\": {\n", - " \"num_records\": 80,\n", - " \"seed_records_multiplier\": 1,\n", - " \"maximum_text_length\": 100,\n", - " \"top_p\": 0.8987601335810778,\n", - " \"top_k\": 43,\n", - " \"num_beams\": 1,\n", - " \"do_sample\": true,\n", - " \"do_early_stopping\": true,\n", - " \"typical_p\": 0.8,\n", - " \"temperature\": null\n", - " }\n", - " }\n", - " }\n", - " ],\n", - " \"notifications\": null,\n", - " \"label_predictors\": null\n", - " },\n", - " \"autouse_config\": null,\n", - " \"autouse_handler_id\": null,\n", - " \"auth_source\": \"grtu\"\n", - "}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.\n", - "INFO: Status is active. A worker has started creating your model!\n", - "2023-10-19T23:17:19.117521Z Resolved revision for model\n", - "{\n", - " \"revision\": \"4874b16751ab2db177bbb898bd4a1e44c89e2f25\",\n", - " \"model\": \"gretelai/mpt-7b\"\n", - "}\n", - "2023-10-19T23:17:19.118032Z Parameter efficient fine tuning (PEFT) methods will be used, which greatly reduce the number of trainable parameters.\n", - "2023-10-19T23:17:19.119751Z Starting GPT model training...\n", - "{\n", - " \"num_train_steps\": 750\n", - "}\n", - "2023-10-19T23:17:19.120088Z Fine-tuning 'gretelai/mpt-7b' with provided dataset!\n", - "2023-10-19T23:17:19.120323Z Disclaimer: the chosen model may produce untrue and/or offensive content without warning. For more info, see https://docs.gretel.ai/reference/synthetics/models/gretel-gpt#limitations-and-biases\n", - "2023-10-19T23:17:19.120688Z Downloading model from remote source. 
Depending on the size of the model, this may take a few minutes.\n", - "2023-10-19T23:18:19.121248Z Model download 72% complete, ETA 23s (9566455971/13300877750 bytes downloaded)\n", - "2023-10-19T23:18:47.602336Z Model download 100% complete (13300877750 bytes downloaded). Loading model onto GPU ...\n", - "2023-10-19T23:19:19.122637Z Still loading model ...\n", - "2023-10-19T23:20:19.127702Z Still loading model ...\n", - "2023-10-19T23:21:19.128373Z Still loading model ...\n", - "2023-10-19T23:21:33.409504Z Successfully loaded model and tokenizer.\n", - "2023-10-19T23:21:33.411805Z PEFT trainable params: 4194304 || all params: 6653480960 || trainable%: 0.0630392425441013\n", - "2023-10-19T23:22:37.786077Z Training in progress, 12.8% complete (step 96/750, ETA 433s)\n", - "{\n", - " \"loss\": 2.9354,\n", - " \"learning_rate\": 2.4e-05,\n", - " \"epoch\": 0.38,\n", - " \"step\": 96\n", - "}\n", - "2023-10-19T23:23:38.862382Z Training in progress, 25.6% complete (step 192/750, ETA 362s)\n", - "{\n", - " \"loss\": 2.7882,\n", - " \"learning_rate\": 4.8e-05,\n", - " \"epoch\": 0.77,\n", - " \"step\": 192\n", - "}\n", - "2023-10-19T23:24:15.833618Z Training in progress, 33.1% complete (step 248/750, ETA 327s)\n", - "{\n", - " \"loss\": 2.5431,\n", - " \"learning_rate\": 6.2e-05,\n", - " \"epoch\": 0.99,\n", - " \"step\": 248\n", - "}\n", - "2023-10-19T23:25:20.519866Z Training in progress, 46.9% complete (step 352/750, ETA 256s)\n", - "{\n", - " \"loss\": 2.0183,\n", - " \"learning_rate\": 8.800000000000001e-05,\n", - " \"epoch\": 1.41,\n", - " \"step\": 352\n", - "}\n", - "2023-10-19T23:26:26.105701Z Training in progress, 60.8% complete (step 456/750, ETA 188s)\n", - "{\n", - " \"loss\": 1.7245,\n", - " \"learning_rate\": 0.00011399999999999999,\n", - " \"epoch\": 1.82,\n", - " \"step\": 456\n", - "}\n", - "2023-10-19T23:26:53.968722Z Training in progress, 66.1% complete (step 496/750, ETA 164s)\n", - "{\n", - " \"loss\": 1.8687,\n", - " \"learning_rate\": 
0.000124,\n", - " \"epoch\": 1.98,\n", - " \"step\": 496\n", - "}\n", - "2023-10-19T23:27:56.653214Z Training in progress, 80.0% complete (step 600/750, ETA 96s)\n", - "{\n", - " \"loss\": 1.5899,\n", - " \"learning_rate\": 0.00015000000000000001,\n", - " \"epoch\": 2.4,\n", - " \"step\": 600\n", - "}\n", - "2023-10-19T23:29:02.517859Z Training in progress, 93.9% complete (step 704/750, ETA 29s)\n", - "{\n", - " \"loss\": 1.6418,\n", - " \"learning_rate\": 0.00017600000000000002,\n", - " \"epoch\": 2.82,\n", - " \"step\": 704\n", - "}\n", - "2023-10-19T23:29:32.185674Z Training in progress, 99.2% complete (step 744/750, ETA 4s)\n", - "{\n", - " \"loss\": 1.8231,\n", - " \"learning_rate\": 0.00018600000000000002,\n", - " \"epoch\": 2.98,\n", - " \"step\": 744\n", - "}\n", - "2023-10-19T23:29:33.409858Z Training in progress, 100.0% complete (step 750/750, ETA 0s)\n", - "{\n", - " \"loss\": 0.4237,\n", - " \"learning_rate\": 0.000188,\n", - " \"epoch\": 3.01,\n", - " \"step\": 750\n", - "}\n", - "2023-10-19T23:29:33.410370Z Training in progress, 100.0% complete (step 750/750, ETA 0s)\n", - "{\n", - " \"train_runtime\": 479.1431,\n", - " \"train_samples_per_second\": 6.278,\n", - " \"train_steps_per_second\": 0.196,\n", - " \"train_loss\": 2.085493191759637,\n", - " \"epoch\": 3.01,\n", - " \"step\": 750\n", - "}\n", - "2023-10-19T23:29:33.411001Z Training is completed!\n", - "2023-10-19T23:29:33.411405Z GPT model training complete.\n", - "2023-10-19T23:29:33.411854Z Saving model\n", - "2023-10-19T23:29:33.465436Z Sampling 80 records using auto prompting.\n", - "2023-10-19T23:29:33.467592Z Using device 'cuda'\n", - "2023-10-19T23:29:33.781277Z Generating records...\n", - "{\n", - " \"num_records\": 80\n", - "}\n", - "2023-10-19T23:30:48.033720Z [54/80] records complete.\n", - "{\n", - " \"current_valid_count\": 54,\n", - " \"total\": 80\n", - "}\n", - "2023-10-19T23:31:21.134597Z Successfully generated 80 records\n", - "2023-10-19T23:31:21.147797Z Creating Synthetic 
Text Data Quality Report...\n", - "2023-10-19T23:31:21.148132Z Creating text metrics report...\n", - "2023-10-19T23:31:32.951627Z Finished creating text metrics report.\n", - "2023-10-19T23:31:32.978093Z Synthetic Text Data Quality Report finished, exporting report artifacts...\n", - "2023-10-19T23:31:32.979116Z Model has been created successfully\n", - "2023-10-19T23:31:35.162769Z Uploading artifacts to Gretel Cloud...\n", - "2023-10-19T23:31:38.343704Z Upload to Gretel Cloud is completed.\n" - ] - } - ], + "outputs": [], "source": [ "from gretel_client.projects.models import read_model_config\n", "\n", @@ -656,20 +156,7 @@ "id": "UsOXO4YPoSA1", "outputId": "1c98264a-02ab-434e-b3f0-a57650b3935e" }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'summary': [{'field': 'synthetic_data_quality_score', 'value': 81},\n", - " {'field': 'semantic_similarity', 'value': 91},\n", - " {'field': 'structure_similarity', 'value': 55}]}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "model.get_report_summary()" ] @@ -685,1260 +172,7 @@ "id": "DCSzW8N-Tf9S", "outputId": "1b0a1431-1d91-481f-cbd7-6d9c5ccd90e2" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - " \n", - " Gretel Synthetic Text Data Quality Report\n", - " \n", - " \n", - "\n", - "\n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
Synthetic Text Data Quality Report
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
Model
\n", - "
GptX Model
\n", - "
\n", - " Model UID \n", - " 6531b8e8f3bf601ba821bc39\n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " Project\n", - " data-summarization\n", - "
\n", - "
\n", - " Generated\n", - " 10/19/2023, 23:17\n", - "
\n", - "
\n", - "
\n", - " \n", - " Excellent\n", - " \n", - "
\n", - "
\n", - "
\n", - "\n", - "
\n", - " Synthetic Text Data Quality Score\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\t \n", - " \n", - "\n", - "
\n", - "
\n", - " \n", - " The Synthetic Text Data Quality Score is computed by taking a weighted\n", - " combination of the individual quality metrics: Text Semantic Similarity\n", - " and Text Structure Similarity. The report supports 50+ languages, including:\n", - " English, French, German, Dutch, Italian, Portuguese, Spanish, Russian,\n", - " Polish, Arabic, Turkish, Chinese, Japanese, Thai and Korean.\n", - " \n", - " Learn more.\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "

Data Summary Statistics

\n", - "\t
\n", - "\t\t
\n", - "\t\t
\n", - "\t\t \n", - "\t\t\t \n", - "\t\t\t Excellent\n", - "\t\t\t \n", - "\t\t \n", - "\t\t\t \n", - "\t\t\t
\n", - "\t\t\t
\n", - "
\n", - "\t\t\t
\n", - "\t\t\tText Semantic Similarity\n", - "\t\t
\n", - "\t\t
\n", - "\t\t
\n", - "\t\t \n", - "\t\t\t \n", - "\t\t\t Moderate\n", - "\t\t\t \n", - "\t\t \n", - "\t\t\t \n", - "\t\t\t
\n", - "\t\t\t
\n", - "
\n", - "\t\t\t
\n", - "\t\t\tText Structure Similarity\n", - "\t\t
\n", - "\t
\n", - "\n", - "\t
\n", - "\t\t\n", - "\t\t\n", - "\t
\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Training DataSynthetic Data
Row Count8080
Column Count11
Training Lines Duplicated-0
Missing Values00
Unique Values8080
Average Words Per Sentence4.484.14
Average Characters Per Word4.173.91
Average Sentence Count8.9010.11
\n", - "

\n", - " What do these values mean?\n", - "

\n", - " \n", - "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "

Semantic Similarity Principal Component Analysis 

\n", - " \n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "

Text Structure Similarity 

\n", - " \n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "
\n", - "\n", - "
\n", - "

\n", - " Copyright © 2023 Gretel Labs, Inc. All rights reserved.\n", - "

\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": { - "text/html": { - "isolated": true - } - }, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "#Plot the text SQS report:\n", "import IPython\n", From d0ea36b1474da26b7a5d7d0e4290ce25d4d6a90f Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Fri, 20 Oct 2023 18:36:45 +0000 Subject: [PATCH 03/11] added a seperate folder for the notebooks called in blogs --- docs/notebooks/{ => blog}/Text-Summerization-gptipynb.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/notebooks/{ => blog}/Text-Summerization-gptipynb.ipynb (100%) diff --git a/docs/notebooks/Text-Summerization-gptipynb.ipynb b/docs/notebooks/blog/Text-Summerization-gptipynb.ipynb similarity index 100% rename from docs/notebooks/Text-Summerization-gptipynb.ipynb rename to docs/notebooks/blog/Text-Summerization-gptipynb.ipynb From e0468bc34e5d202b7d7b79169b2bf539dbf60519 Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Fri, 20 Oct 2023 18:38:53 +0000 Subject: [PATCH 04/11] update notebook name --- ...-Summerization-gptipynb.ipynb => Text-Summerization-gpt.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/notebooks/blog/{Text-Summerization-gptipynb.ipynb => Text-Summerization-gpt.ipynb} (100%) diff --git a/docs/notebooks/blog/Text-Summerization-gptipynb.ipynb b/docs/notebooks/blog/Text-Summerization-gpt.ipynb similarity index 100% rename from docs/notebooks/blog/Text-Summerization-gptipynb.ipynb rename to docs/notebooks/blog/Text-Summerization-gpt.ipynb From 58244a87da53ad27f34c42ffd60e950615689abc Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Tue, 24 Oct 2023 18:07:43 +0000 Subject: [PATCH 05/11] changed folder name to content --- docs/notebooks/{blog => content}/Text-Summerization-gpt.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/notebooks/{blog => content}/Text-Summerization-gpt.ipynb (100%) diff --git 
a/docs/notebooks/blog/Text-Summerization-gpt.ipynb b/docs/notebooks/content/Text-Summerization-gpt.ipynb similarity index 100% rename from docs/notebooks/blog/Text-Summerization-gpt.ipynb rename to docs/notebooks/content/Text-Summerization-gpt.ipynb From e3bec549a3018eec0fa2c576891fcca69124a3f1 Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Wed, 25 Oct 2023 16:50:29 +0000 Subject: [PATCH 06/11] change notebook name to the snake case --- ...{Text-Summerization-gpt.ipynb => text_summerization_gpt.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/notebooks/content/{Text-Summerization-gpt.ipynb => text_summerization_gpt.ipynb} (100%) diff --git a/docs/notebooks/content/Text-Summerization-gpt.ipynb b/docs/notebooks/content/text_summerization_gpt.ipynb similarity index 100% rename from docs/notebooks/content/Text-Summerization-gpt.ipynb rename to docs/notebooks/content/text_summerization_gpt.ipynb From 3b14ac153c69d4af002ccd35c8614b6e6ec0739c Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Wed, 25 Oct 2023 16:53:38 +0000 Subject: [PATCH 07/11] update the link to the API key --- docs/notebooks/content/text_summerization_gpt.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notebooks/content/text_summerization_gpt.ipynb b/docs/notebooks/content/text_summerization_gpt.ipynb index 0c4cbea0..41f3c55b 100644 --- a/docs/notebooks/content/text_summerization_gpt.ipynb +++ b/docs/notebooks/content/text_summerization_gpt.ipynb @@ -9,7 +9,7 @@ "# Generate Synthetic text summarization with Gretel GPT\n", "\n", "* In this notebook we use Gretel GPT with Llama-2 7b model to create synthetic text summerization dataset. \n", - "* To run this notebook, you will need an API key from the [Gretel Console](https://console.gretel.ai/)." + "* To run this notebook, you will need an API key from the [Gretel Console](https://console.gretel.ai/users/me/key/)." 
] }, { From 62b9cb769b43f5113900c74c212fbd96915f21fa Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Wed, 25 Oct 2023 17:02:43 +0000 Subject: [PATCH 08/11] update steps for slightly better results --- docs/notebooks/content/text_summerization_gpt.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notebooks/content/text_summerization_gpt.ipynb b/docs/notebooks/content/text_summerization_gpt.ipynb index 41f3c55b..5180897a 100644 --- a/docs/notebooks/content/text_summerization_gpt.ipynb +++ b/docs/notebooks/content/text_summerization_gpt.ipynb @@ -123,7 +123,7 @@ "\n", "\n", "config = read_model_config(\"synthetics/natural-language\")\n", - "config[\"models\"][0][\"gpt_x\"][\"steps\"] = 600 #set different step values.\n", + "config[\"models\"][0][\"gpt_x\"][\"steps\"] = 1000 #set different step values.\n", "\n", "# Designate project\n", "PROJECT = 'data-summarization'\n", From 1eb0df3834d618ffa2c1da81633e0355002bd55b Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Wed, 25 Oct 2023 17:04:30 +0000 Subject: [PATCH 09/11] addressed feedback comments --- docs/notebooks/content/text_summerization_gpt.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notebooks/content/text_summerization_gpt.ipynb b/docs/notebooks/content/text_summerization_gpt.ipynb index 5180897a..35a9811c 100644 --- a/docs/notebooks/content/text_summerization_gpt.ipynb +++ b/docs/notebooks/content/text_summerization_gpt.ipynb @@ -143,7 +143,7 @@ "id": "Lh4-8dddoTWb" }, "source": [ - "## Generate Text Synthetic Quality Score:" + "## Display Text Synthetic Quality Score:" ] }, { From edd040a861416c1bda0853c4cc9a932e8007e38b Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Wed, 25 Oct 2023 23:48:37 +0000 Subject: [PATCH 10/11] updated the notebook to the new SDK interface --- .../content/text_summerization_gpt.ipynb | 95 ++++++------------- 1 file changed, 29 insertions(+), 66 deletions(-) diff --git a/docs/notebooks/content/text_summerization_gpt.ipynb 
b/docs/notebooks/content/text_summerization_gpt.ipynb index 35a9811c..f4ef54d2 100644 --- a/docs/notebooks/content/text_summerization_gpt.ipynb +++ b/docs/notebooks/content/text_summerization_gpt.ipynb @@ -41,29 +41,9 @@ }, "outputs": [], "source": [ + "#import required packages\n", "import pandas as pd\n", - "\n", - "from gretel_client import configure_session\n", - "from gretel_client.helpers import poll\n", - "from gretel_client.projects import create_or_get_unique_project, get_project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mlQIp_uGTTgo", - "outputId": "e27df360-d8b3-46ea-df4a-4244f6c3373a" - }, - "outputs": [], - "source": [ - "# Log into Gretel\n", - "configure_session(api_key=\"prompt\", cache=\"yes\", endpoint=\"https://api.gretel.cloud\", validate=True, clear=True)\n", - "\n", - "pd.set_option('max_colwidth', None)" + "from gretel_client import Gretel" ] }, { @@ -88,18 +68,19 @@ }, "outputs": [], "source": [ + "pd.set_option('max_colwidth', None)\n", + "\n", "# Specify a dataset to train on\n", "DATASET_PATH = 'https://gretel-datasets.s3.us-west-2.amazonaws.com/Text-dataset/Samsum-text-summerization-sample-1000.csv'\n", "df = pd.read_csv(DATASET_PATH)\n", "\n", + "#Let's look at the training dataset:\n", "df.head()" ] }, { "cell_type": "markdown", - "metadata": { - "id": "4PD5B0U06ALs" - }, + "metadata": {}, "source": [ "## Configure and Train the Synthetic Model:\n", "\n", @@ -109,32 +90,22 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HpjRvCmjU5qG", - "outputId": "2bd9311d-d749-4431-a679-fd2a8cc69e27" - }, + "metadata": {}, "outputs": [], "source": [ - "from gretel_client.projects.models import read_model_config\n", - "\n", "\n", "\n", - "config = read_model_config(\"synthetics/natural-language\")\n", - "config[\"models\"][0][\"gpt_x\"][\"steps\"] = 1000 #set 
different step values.\n", - "\n", - "# Designate project\n", "PROJECT = 'data-summarization'\n", - "project = create_or_get_unique_project(name=PROJECT)\n", + "LLM = \"meta-llama/Llama-2-7b-hf\"\n", "\n", - "# Create and submit model\n", - "model = project.create_model_obj(model_config=config, data_source=df)\n", - "model.name = f\"{PROJECT}-llama-2-7b\"\n", - "model.submit_cloud()\n", + "gretel = Gretel(project_name=f\"{PROJECT}-llama-2-7b\", api_key=\"prompt\", validate=True)\n", "\n", - "poll(model)\n" + "trained = gretel.submit_train(\n", + " \"natural-language\",\n", + " data_source=df,\n", + " pretrained_model=LLM,\n", + " params={\"steps\": 1000}, \n", + " )" ] }, { @@ -149,36 +120,19 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UsOXO4YPoSA1", - "outputId": "1c98264a-02ab-434e-b3f0-a57650b3935e" - }, + "metadata": {}, "outputs": [], "source": [ - "model.get_report_summary()" + "trained.report.quality_scores" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "DCSzW8N-Tf9S", - "outputId": "1b0a1431-1d91-481f-cbd7-6d9c5ccd90e2" - }, + "metadata": {}, "outputs": [], "source": [ - "#Plot the text SQS report:\n", - "import IPython\n", - "from smart_open import open\n", - "\n", - "IPython.display.HTML(data=open(model.get_artifact_link(\"text_metrics_report\")).read(), metadata=dict(isolated=True))" + "trained.report.display_in_notebook()" ] }, { @@ -200,7 +154,16 @@ "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } }, "nbformat": 4, From f474dbad86169621418fb67c43493a46ee597d57 Mon Sep 17 00:00:00 2001 From: marjan_emd Date: Thu, 26 Oct 
2023 00:13:32 +0000 Subject: [PATCH 11/11] added the Open in Colab button. --- docs/notebooks/content/text_summerization_gpt.ipynb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/notebooks/content/text_summerization_gpt.ipynb b/docs/notebooks/content/text_summerization_gpt.ipynb index f4ef54d2..c5bef792 100644 --- a/docs/notebooks/content/text_summerization_gpt.ipynb +++ b/docs/notebooks/content/text_summerization_gpt.ipynb @@ -5,6 +5,15 @@ "metadata": { "id": "4vXfYHX6QSJu" }, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ "# Generate Synthetic text summarization with Gretel GPT\n", "\n",