Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding additional module for creating PII / PHI documents #40

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions examples/piidoc_pipeline/configs/auto_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Auto-mode configuration for the PII / PHI document pipeline.
# Keys mirror the fields of PiiDocsAutoConfig; the pair
# (num_domains, num_doctypes_per_domain) is what selects that config class.
doc_lang: pii_doc
llm_suite_type: open_license

num_domains: 10
num_doctypes_per_domain: 10

entity_validation: true
11 changes: 11 additions & 0 deletions examples/piidoc_pipeline/configs/llms/dev.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
- model_name: gretelai/gpt-mistral-nemo-2407
litellm_params:
model: openai/gretelai/gpt-mistral-nemo-2407
base_url: https://llmproxy.dev.gretel.cloud/v1/
extra_headers:
Authorization: os.environ/GRETEL_DEV_API_KEY
tags:
- open_license
- nl
- code
- judge
9 changes: 9 additions & 0 deletions examples/piidoc_pipeline/configs/llms/local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
- model_name: local
litellm_params:
model: openai/local
base_url: http://localhost:8080/v1/
tags:
- open_license
- nl
- code
- judge
10 changes: 10 additions & 0 deletions examples/piidoc_pipeline/configs/llms/prod.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
- model_name: gretelai-mistral-nemo-2407
litellm_params:
model: gretelai/gpt-mistral-nemo-2407
api_key: os.environ/GRETEL_PROD_API_KEY
api_base: https://api.gretel.ai
tags:
- open_license
- nl
- code
- judge
25 changes: 25 additions & 0 deletions examples/piidoc_pipeline/configs/manual_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
doc_lang: pii_doc
llm_suite_type: open_license


domain_and_doctypes:
Healthcare:
- Appointment Scheduling
- Patient Consent Form
- Medical History Form
- Insurance Claim
- Medical Bill Statement
Legal Documents:
- Contract
- Contract
- Agreement
- Service Level Agreement
- Non-Disclosure Agreement
Financial Services:
- Loan Application Form
- Credit Report
- Insurance Policy
- Bank Statement
- Investment Statement

entity_validation: true
10 changes: 10 additions & 0 deletions examples/piidoc_pipeline/llm_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
- litellm_params:
api_base: https://api.gretel.ai
api_key: os.environ/GRETEL_PROD_API_KEY
model: gretelai/gpt-mistral-nemo-2407
model_name: gretelai-mistral-nemo-2407
tags:
- open_license
- nl
- judge
- code
143 changes: 143 additions & 0 deletions examples/piidoc_pipeline/piidocs_generation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "8e43db24-efe0-44b4-84fd-aba06527c235",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import yaml\n",
"from getpass import getpass\n",
"\n",
"from navigator_helpers import PiiDocsPipeline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8f0b52fe",
"metadata": {},
"outputs": [],
"source": [
"llm_config = \"\"\"\n",
"- model_name: gretelai-mistral-nemo-2407\n",
" litellm_params:\n",
" model: gretelai/gpt-mistral-nemo-2407\n",
" api_key: os.environ/GRETEL_PROD_API_KEY\n",
" api_base: https://api.gretel.ai\n",
" tags:\n",
" - open_license\n",
" - nl\n",
" - judge\n",
" - code\n",
"\"\"\"\n",
"\n",
"# Save the configuration to a local YAML file\n",
"llm_config_path = \"llm_config.yaml\"\n",
"with open(llm_config_path, \"w\") as file:\n",
" yaml.dump(yaml.safe_load(llm_config), file, default_flow_style=False)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1cb9c7c2-f28e-4c0b-87fa-871a1debda84",
"metadata": {},
"outputs": [],
"source": [
"pii_doc_config = \"\"\"\n",
"doc_lang: pii_doc\n",
"llm_suite_type: open_license\n",
"\n",
"num_domains: 3\n",
"num_doctypes_per_domain: 10\n",
"\n",
"entity_validation: true\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f37e952e",
"metadata": {},
"outputs": [],
"source": [
"# Set the Gretel API key as an environment variable\n",
"os.environ[\"GRETEL_PROD_API_KEY\"] = getpass(\"Enter your Gretel API key: \")\n",
"\n",
"pipe = PiiDocsPipeline(pii_doc_config, llm_config=llm_config_path)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "18a77c21",
"metadata": {},
"outputs": [],
"source": [
"contextual_tags = {\n",
" \"domain_and_doctypes\": {\n",
" \"healthcare\": None,\n",
" \"e-commerce\": None,\n",
" \"education\": None\n",
" },\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44a13092",
"metadata": {},
"outputs": [],
"source": [
"pipe.set_contextual_tags(contextual_tags)\n",
"pipe.show_contextual_tags()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7384e462",
"metadata": {},
"outputs": [],
"source": [
"results = pipe.run(num_samples=5, max_workers=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05b101d2",
"metadata": {},
"outputs": [],
"source": [
"results.display_sample()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions navigator_helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .llms.llm_suite import GretelLLMSuite
from .pipelines.config.utils import smart_load_pipeline_config
from .pipelines.text_to_code import NL2CodePipeline
from .pipelines.pii_documents import PiiDocsPipeline
from .tasks.text_to_code.task_suite import NL2PythonTaskSuite, NL2SQLTaskSuite
from .text_inference import TextInference

Expand Down
7 changes: 6 additions & 1 deletion navigator_helpers/pipelines/config/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ class PipelineConfig(BaseModel):
def validate_artifact_path(self):
    """Resolve ``artifact_path`` to a per-language directory and create it.

    When ``artifact_path`` is set it is converted to a ``Path`` and extended
    with one sub-directory: ``code_lang.value`` if the config has a
    ``code_lang`` attribute, otherwise ``doc_lang.value`` if it has a
    ``doc_lang`` attribute, otherwise the literal ``"default"``. The directory
    is created, including missing parents. When ``artifact_path`` is ``None``
    nothing is touched.

    Returns:
        ``self``, so the method can be used as a validator that hands the
        instance back.
    """
    if self.artifact_path is None:
        return self

    # ``code_lang`` takes precedence over ``doc_lang``; configs that declare
    # neither share a common "default" directory instead of raising
    # AttributeError.
    for lang_field in ("code_lang", "doc_lang"):
        if hasattr(self, lang_field):
            subdir = getattr(self, lang_field).value
            break
    else:
        subdir = "default"

    self.artifact_path = Path(self.artifact_path) / subdir
    self.artifact_path.mkdir(parents=True, exist_ok=True)
    return self

Expand Down
17 changes: 17 additions & 0 deletions navigator_helpers/pipelines/config/pii_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from pydantic import BaseModel

from navigator_helpers.pipelines.config.base import PipelineConfig
from navigator_helpers.tasks.pii_documents.task_suite import DocLang


class PiiDocsAutoConfig(PipelineConfig):
    """Configuration for the PII document pipeline in "auto" mode.

    Unlike the manual variant, no explicit domain/doctype mapping is supplied;
    only the counts below are given. Every field has a default, so an empty
    mapping is enough to build an instance.
    """

    # Document language tag; defaults to the PII document type.
    doc_lang: DocLang = DocLang.PII_DOC
    # Presumably how many domains to produce -- confirm against the pipeline.
    num_domains: int = 10
    # Presumably how many document types to produce per domain -- confirm.
    num_doctypes_per_domain: int = 10
    # Toggle for entity validation; enabled by default.
    entity_validation: bool = True


class PiiDocsManualConfig(PipelineConfig):
    """Configuration for the PII document pipeline in "manual" mode.

    The caller provides the domain-to-doctypes mapping directly, so
    ``domain_and_doctypes`` is required and has no default.
    """

    # Document language tag; defaults to the PII document type.
    doc_lang: DocLang = DocLang.PII_DOC
    # Maps each domain name to its list of document type names, e.g.
    # {"Healthcare": ["Patient Consent Form", "Insurance Claim"]}.
    domain_and_doctypes: dict[str, list[str]]
    # Toggle for entity validation; enabled by default.
    entity_validation: bool = True
28 changes: 22 additions & 6 deletions navigator_helpers/pipelines/config/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,30 @@
NL2CodeAutoConfig,
NL2CodeManualConfig,
)
from navigator_helpers.pipelines.config.pii_documents import (
PiiDocsAutoConfig,
PiiDocsManualConfig,
)


# Maps a tuple of config keys to the config class to use when ALL of those
# keys are present in a loaded config dictionary.
# get_config_class() walks this mapping in insertion order and returns the
# first match, so entry order is significant: a config containing both
# "num_domains" and "num_doctypes_per_domain" resolves to PiiDocsAutoConfig
# before the NL2Code entry is considered.
CONFIG_MAP = {
("num_domains", "num_doctypes_per_domain"): PiiDocsAutoConfig,
("num_topics_per_domain", "num_complexity_levels"): NL2CodeAutoConfig,
("domain_and_doctypes",): PiiDocsManualConfig,
}

def smart_load_pipeline_config(config: ConfigLike) -> PipelineConfig:
    """Return a pipeline config object for ``config``.

    Instances of the known config classes (NL2Code and PII-docs, manual and
    auto) are returned unchanged. Any other input is handed to
    ``smart_load_yaml`` and the loaded mapping is used to instantiate the
    class chosen by ``get_config_class``.

    Args:
        config: An already-built config object, or any input accepted by
            ``smart_load_yaml``.

    Returns:
        The original object if it was already a known config instance,
        otherwise a newly constructed one.
    """
    known_config_types = (
        NL2CodeManualConfig,
        NL2CodeAutoConfig,
        PiiDocsManualConfig,
        PiiDocsAutoConfig,
    )
    if isinstance(config, known_config_types):
        return config

    # Load exactly once and build exactly once: the class is picked from the
    # raw mapping, never from an already-constructed model.
    loaded = smart_load_yaml(config)
    config_class = get_config_class(loaded)
    return config_class(**loaded)

def get_config_class(config: dict) -> type[PipelineConfig]:
    """Determine the config class to use from the keys present in ``config``.

    ``CONFIG_MAP`` is scanned in insertion order and the first entry whose
    keys are all present in ``config`` wins. When nothing matches, the
    NL2Code classes are used: ``NL2CodeAutoConfig`` if ``num_domains`` is
    present, ``NL2CodeManualConfig`` otherwise.

    Args:
        config: Parsed pipeline configuration mapping.

    Returns:
        The config class that should be instantiated with ``config``.
    """
    for required_keys, config_class in CONFIG_MAP.items():
        if all(key in config for key in required_keys):
            return config_class

    # Before CONFIG_MAP was introduced, any config containing "num_domains"
    # was loaded as NL2CodeAutoConfig. Keep that rule as the fallback so such
    # configs that omit the other auto-mode keys are not silently treated as
    # manual configs.
    if "num_domains" in config:
        return NL2CodeAutoConfig
    return NL2CodeManualConfig
Loading