gretelai · mvansegbroeck · Oct 12, 2024 · Oct 12, 2024 · Oct 12, 2024 · Oct 12, 2024
diff --git a/examples/piidoc_pipeline/configs/auto_config.yml b/examples/piidoc_pipeline/configs/auto_config.yml
@@ -0,0 +1,9 @@
+# Auto-mode configuration for the PII documents pipeline: domains and
+# document types are specified by count rather than listed explicitly.
+doc_lang: pii_doc
+llm_suite_type: open_license
+
+num_domains: 10
+num_doctypes_per_domain: 10
+
+entity_validation: true
diff --git a/examples/piidoc_pipeline/configs/llms/dev.yml b/examples/piidoc_pipeline/configs/llms/dev.yml
@@ -0,0 +1,11 @@
+- model_name: gretelai/gpt-mistral-nemo-2407
+  litellm_params:
+    model: openai/gretelai/gpt-mistral-nemo-2407
+    base_url: https://llmproxy.dev.gretel.cloud/v1/
+    extra_headers:
+      Authorization: os.environ/GRETEL_DEV_API_KEY
+  tags:
+  - open_license
+  - nl
+  - code
+  - judge
diff --git a/examples/piidoc_pipeline/configs/llms/local.yml b/examples/piidoc_pipeline/configs/llms/local.yml
@@ -0,0 +1,9 @@
+- model_name: local
+  litellm_params:
+    model: openai/local
+    base_url: http://localhost:8080/v1/
+  tags:
+  - open_license
+  - nl
+  - code
+  - judge
diff --git a/examples/piidoc_pipeline/configs/llms/prod.yml b/examples/piidoc_pipeline/configs/llms/prod.yml
@@ -0,0 +1,10 @@
+- model_name: gretelai-mistral-nemo-2407
+  litellm_params:
+    model: gretelai/gpt-mistral-nemo-2407
+    api_key: os.environ/GRETEL_PROD_API_KEY
+    api_base: https://api.gretel.ai
+  tags:
+  - open_license
+  - nl
+  - code
+  - judge
diff --git a/examples/piidoc_pipeline/configs/manual_config.yml b/examples/piidoc_pipeline/configs/manual_config.yml
@@ -0,0 +1,25 @@
+doc_lang: pii_doc
+llm_suite_type: open_license
+
+
+domain_and_doctypes: 
+    Healthcare: 
+        - Appointment Scheduling
+        - Patient Consent Form
+        - Medical History Form
+        - Insurance Claim
+        - Medical Bill Statement
+    Legal Documents: 
+        - Contract
+        - Contract
+        - Agreement
+        - Service Level Agreement
+        - Non-Disclosure Agreement
+    Financial Services:
+        - Loan Application Form
+        - Credit Report
+        - Insurance Policy
+        - Bank Statement
+        - Investment Statement
+
+entity_validation: true
diff --git a/examples/piidoc_pipeline/llm_config.yaml b/examples/piidoc_pipeline/llm_config.yaml
@@ -0,0 +1,10 @@
+- litellm_params:
+    api_base: https://api.gretel.ai
+    api_key: os.environ/GRETEL_PROD_API_KEY
+    model: gretelai/gpt-mistral-nemo-2407
+  model_name: gretelai-mistral-nemo-2407
+  tags:
+  - open_license
+  - nl
+  - judge
+  - code
diff --git a/examples/piidoc_pipeline/piidocs_generation.ipynb b/examples/piidoc_pipeline/piidocs_generation.ipynb
@@ -0,0 +1,143 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "8e43db24-efe0-44b4-84fd-aba06527c235",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import yaml\n",
+    "from getpass import getpass\n",
+    "\n",
+    "from navigator_helpers import PiiDocsPipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8f0b52fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm_config = \"\"\"\n",
+    "- model_name: gretelai-mistral-nemo-2407\n",
+    "  litellm_params:\n",
+    "    model: gretelai/gpt-mistral-nemo-2407\n",
+    "    api_key: os.environ/GRETEL_PROD_API_KEY\n",
+    "    api_base: https://api.gretel.ai\n",
+    "  tags:\n",
+    "  - open_license\n",
+    "  - nl\n",
+    "  - judge\n",
+    "  - code\n",
+    "\"\"\"\n",
+    "\n",
+    "# Save the configuration to a local YAML file\n",
+    "llm_config_path = \"llm_config.yaml\"\n",
+    "with open(llm_config_path, \"w\") as file:\n",
+    "    yaml.dump(yaml.safe_load(llm_config), file, default_flow_style=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1cb9c7c2-f28e-4c0b-87fa-871a1debda84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pii_doc_config = \"\"\"\n",
+    "doc_lang: pii_doc\n",
+    "llm_suite_type: open_license\n",
+    "\n",
+    "num_domains: 3\n",
+    "num_doctypes_per_domain: 10\n",
+    "\n",
+    "entity_validation: true\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f37e952e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the Gretel API key as an environment variable\n",
+    "os.environ[\"GRETEL_PROD_API_KEY\"] = getpass(\"Enter your Gretel API key: \")\n",
+    "\n",
+    "pipe = PiiDocsPipeline(pii_doc_config, llm_config=llm_config_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "18a77c21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "contextual_tags = {\n",
+    "    \"domain_and_doctypes\": {\n",
+    "        \"healthcare\": None,\n",
+    "        \"e-commerce\": None,\n",
+    "        \"education\": None\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "44a13092",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe.set_contextual_tags(contextual_tags)\n",
+    "pipe.show_contextual_tags()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7384e462",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = pipe.run(num_samples=5, max_workers=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "05b101d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results.display_sample()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "venv"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/navigator_helpers/__init__.py b/navigator_helpers/__init__.py
@@ -17,6 +17,7 @@
 from .llms.llm_suite import GretelLLMSuite
 from .pipelines.config.utils import smart_load_pipeline_config
 from .pipelines.text_to_code import NL2CodePipeline
+from .pipelines.pii_documents import PiiDocsPipeline
 from .tasks.text_to_code.task_suite import NL2PythonTaskSuite, NL2SQLTaskSuite
 from .text_inference import TextInference
 

diff --git a/navigator_helpers/pipelines/config/base.py b/navigator_helpers/pipelines/config/base.py
@@ -14,7 +14,12 @@ class PipelineConfig(BaseModel):
     def validate_artifact_path(self):
+        """Point ``artifact_path`` at a per-language sub-directory and create it.
+
+        Does nothing when ``artifact_path`` is ``None``. Otherwise the value is
+        converted to a ``Path`` and extended with ``code_lang.value`` if the
+        instance has a ``code_lang`` attribute, else ``doc_lang.value`` if it
+        has ``doc_lang``, else the literal ``"default"``. The resulting
+        directory is created (including parents) if it does not exist.
+
+        Returns:
+            The config instance itself.
+        """
         if self.artifact_path is not None:
             self.artifact_path = Path(self.artifact_path)
-            self.artifact_path = self.artifact_path / f"{self.code_lang.value}"
+            if hasattr(self, 'code_lang'):
+                self.artifact_path = self.artifact_path / self.code_lang.value
+            elif hasattr(self, 'doc_lang'):
+                self.artifact_path = self.artifact_path / self.doc_lang.value
+            else:
+                self.artifact_path = self.artifact_path / "default"
             self.artifact_path.mkdir(parents=True, exist_ok=True)
         return self
 

diff --git a/navigator_helpers/pipelines/config/pii_documents.py b/navigator_helpers/pipelines/config/pii_documents.py
@@ -0,0 +1,17 @@
+from pydantic import BaseModel
+
+from navigator_helpers.pipelines.config.base import PipelineConfig
+from navigator_helpers.tasks.pii_documents.task_suite import DocLang
+
+
+# Pipeline config for PII document generation where domains and document
+# types are specified by count instead of being listed explicitly. Every
+# field has a default, so an empty mapping is enough to build an instance.
+class PiiDocsAutoConfig(PipelineConfig, BaseModel):
+    # Document "language" tag; its value names the artifact sub-directory.
+    doc_lang: DocLang = DocLang.PII_DOC
+    # Number of domains (presumably generated by the pipeline -- confirm there).
+    num_domains: int = 10
+    # Number of document types per domain.
+    num_doctypes_per_domain: int = 10
+    # Toggle for entity validation; semantics live in the pipeline, not here.
+    entity_validation: bool = True
+
+
+# Pipeline config for PII document generation where the caller lists the
+# domains and their document types explicitly.
+class PiiDocsManualConfig(PipelineConfig, BaseModel):
+    # Document "language" tag; its value names the artifact sub-directory.
+    doc_lang: DocLang = DocLang.PII_DOC
+    # Required: maps each domain name to the list of its document type names.
+    domain_and_doctypes: dict[str, list[str]]
+    # Toggle for entity validation; semantics live in the pipeline, not here.
+    entity_validation: bool = True
diff --git a/navigator_helpers/pipelines/config/utils.py b/navigator_helpers/pipelines/config/utils.py
@@ -5,14 +5,30 @@
     NL2CodeAutoConfig,
     NL2CodeManualConfig,
 )
+from navigator_helpers.pipelines.config.pii_documents import (
+    PiiDocsAutoConfig,
+    PiiDocsManualConfig,
+)
+
 
+# Maps a tuple of config keys to the config class that is selected when ALL
+# of those keys are present in a raw config dict. get_config_class scans the
+# entries in insertion order and returns the first match, so order matters
+# whenever a config could satisfy more than one entry.
+CONFIG_MAP = {
+    ("num_domains", "num_doctypes_per_domain"): PiiDocsAutoConfig,
+    ("num_topics_per_domain", "num_complexity_levels"): NL2CodeAutoConfig,
+    ("domain_and_doctypes",): PiiDocsManualConfig,
+}
 
 def smart_load_pipeline_config(config: ConfigLike) -> PipelineConfig:
+    """Return a pipeline config object for ``config``.
+
+    Instances of the NL2Code or PiiDocs config classes are returned unchanged.
+    Anything else is passed through ``smart_load_yaml`` and the result is used
+    to instantiate the class chosen by ``get_config_class``.
+    """
-    if not isinstance(config, (NL2CodeManualConfig, NL2CodeAutoConfig)):
+    if not isinstance(config, (NL2CodeManualConfig, NL2CodeAutoConfig, PiiDocsManualConfig, PiiDocsAutoConfig)):
         config = smart_load_yaml(config)
-        config = (
-            NL2CodeAutoConfig(**config)
-            if "num_domains" in config
-            else NL2CodeManualConfig(**config)
-        )
+        config_class = get_config_class(config)
+        return config_class(**config)
+
     return config
+
+def get_config_class(config: dict) -> type[PipelineConfig]:
+    """Determine the config class to instantiate for a raw config dictionary.
+
+    The entries of ``CONFIG_MAP`` are tried in insertion order and the first
+    entry whose keys are all present in ``config`` wins. If none matches, two
+    fallbacks cover configs that omit optional fields before the final
+    default is used.
+
+    Args:
+        config: Raw configuration mapping, e.g. as loaded from YAML.
+
+    Returns:
+        The config class that ``config`` should be unpacked into.
+    """
+    for keys, config_class in CONFIG_MAP.items():
+        if all(key in config for key in keys):
+            return config_class
+    # A config carrying ``doc_lang`` belongs to the PII documents pipeline.
+    # Manual configs were already matched above via ``domain_and_doctypes``
+    # (a required field), so what remains is an auto config relying on the
+    # defaults of ``num_domains`` / ``num_doctypes_per_domain``.
+    if "doc_lang" in config:
+        return PiiDocsAutoConfig
+    # Keep the dispatch rule that predates CONFIG_MAP: any other config that
+    # sets ``num_domains`` is an NL2Code auto config, even when the remaining
+    # auto-mode keys are left out.
+    if "num_domains" in config:
+        return NL2CodeAutoConfig
+    return NL2CodeManualConfig  # Default config class if no match is found