gretelai · Marjan-emd · Oct 26, 2023 · Oct 20, 2023 · Oct 20, 2023 · Oct 20, 2023
diff --git a/docs/notebooks/content/text_summerization_gpt.ipynb b/docs/notebooks/content/text_summerization_gpt.ipynb
@@ -0,0 +1,208 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4vXfYHX6QSJu"
+      },
+      "source": [
+        "# Generate synthetic text summarization data with Gretel GPT\n",
+        "\n",
+        "* In this notebook we use Gretel GPT with the Llama-2 7B model to create a synthetic text summarization dataset.  \n",
+        "* To run this notebook, you will need an API key from the [Gretel Console](https://console.gretel.ai/users/me/key/)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "GhwZL2atTilv"
+      },
+      "source": [
+        "## Getting Started"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "V_iIkqnUQK2l"
+      },
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "%pip install -U gretel-client"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "kixD67x_TSC4"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd\n",
+        "\n",
+        "from gretel_client import configure_session\n",
+        "from gretel_client.helpers import poll\n",
+        "from gretel_client.projects import create_or_get_unique_project, get_project"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "mlQIp_uGTTgo",
+        "outputId": "e27df360-d8b3-46ea-df4a-4244f6c3373a"
+      },
+      "outputs": [],
+      "source": [
+        "# Log into Gretel (api_key=\"prompt\" requests the key interactively rather than hardcoding it)\n",
+        "configure_session(api_key=\"prompt\", cache=\"yes\", endpoint=\"https://api.gretel.cloud\", validate=True, clear=True)\n",
+        "# Use the fully qualified option name; pandas warns that abbreviated names can break when similar options are added\n",
+        "pd.set_option(\"display.max_colwidth\", None)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "0pMwi0RghUzh"
+      },
+      "source": [
+        "## Load and preview training data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 234
+        },
+        "id": "_QyG3jfRh2-i",
+        "outputId": "a60c1c7e-b71e-4843-cfb4-784000730546"
+      },
+      "outputs": [],
+      "source": [
+        "# Location of the CSV file used as training data\n",
+        "DATASET_PATH = \"https://gretel-datasets.s3.us-west-2.amazonaws.com/Text-dataset/Samsum-text-summerization-sample-1000.csv\"\n",
+        "df = pd.read_csv(filepath_or_buffer=DATASET_PATH)\n",
+        "\n",
+        "df.head()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4PD5B0U06ALs"
+      },
+      "source": [
+        "## Configure and Train the Synthetic Model:\n",
+        "\n",
+        "We can experiment with different values of the \"steps\" parameter, which changes the resulting text SQS (Synthetic Quality Score)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "HpjRvCmjU5qG",
+        "outputId": "2bd9311d-d749-4431-a679-fd2a8cc69e27"
+      },
+      "outputs": [],
+      "source": [
+        "from gretel_client.projects.models import read_model_config\n",
+        "\n",
+        "# Load the \"synthetics/natural-language\" model config and override its number of steps\n",
+        "config = read_model_config(\"synthetics/natural-language\")\n",
+        "gpt_x_params = config[\"models\"][0][\"gpt_x\"]\n",
+        "gpt_x_params[\"steps\"] = 1000  # set different step values.\n",
+        "\n",
+        "# Designate project\n",
+        "PROJECT = \"data-summarization\"\n",
+        "project = create_or_get_unique_project(name=PROJECT)\n",
+        "\n",
+        "# Create the model object, name it, submit it to the cloud and poll it\n",
+        "model = project.create_model_obj(model_config=config, data_source=df)\n",
+        "model.name = PROJECT + \"-llama-2-7b\"\n",
+        "model.submit_cloud()\n",
+        "\n",
+        "poll(model)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Lh4-8dddoTWb"
+      },
+      "source": [
+        "## Display Text Synthetic Quality Score:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "UsOXO4YPoSA1",
+        "outputId": "1c98264a-02ab-434e-b3f0-a57650b3935e"
+      },
+      "outputs": [],
+      "source": [
+        "model.get_report_summary()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1000
+        },
+        "id": "DCSzW8N-Tf9S",
+        "outputId": "1b0a1431-1d91-481f-cbd7-6d9c5ccd90e2"
+      },
+      "outputs": [],
+      "source": [
+        "# Plot the text SQS report; smart_open is imported as a module so the built-in open() is not shadowed\n",
+        "import smart_open\n",
+        "from IPython.display import HTML, display\n",
+        "with smart_open.open(model.get_artifact_link(\"text_metrics_report\")) as report_file:  # closes the handle after reading\n",
+        "    display(HTML(data=report_file.read(), metadata=dict(isolated=True)))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "J_yIE4WrW1Je"
+      },
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}