From 3162013c51dfc96592c17fc26e4df4f295739573 Mon Sep 17 00:00:00 2001 From: dhruvnathawani <128275431+dhruvnathawani@users.noreply.github.com> Date: Wed, 20 Nov 2024 08:38:23 -0800 Subject: [PATCH] Update Navigator data designer Blueprints for M1 (#465) * update python blueprint * update sql blueprint --- ...igator-data-designer-sdk-text-to-sql.ipynb | 125 ++++++++++-------- 1 file changed, 73 insertions(+), 52 deletions(-) diff --git a/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-sql.ipynb b/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-sql.ipynb index 379ecbd6..054e8328 100644 --- a/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-sql.ipynb +++ b/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-sql.ipynb @@ -72,44 +72,47 @@ "outputs": [], "source": [ "config = \"\"\"\n", - "model_suite: llama-3.x\n", + "model_suite: apache-2.0\n", "\n", "special_system_instructions: >-\n", " You are an expert at writing, analyzing and editing SQL queries. You know what\n", " a high-quality, clean, efficient, and maintainable SQL code looks like. You\n", " excel at transforming natural language into SQL, as well as SQL back into\n", " natural language. 
Your job is to assist the user with their SQL-related tasks.\n", - " Leverage T-SQL only.\n", "\n", "categorical_seed_columns:\n", - " - name: domain\n", - " description: Major industry domain or sector that relies on robust data solutions\n", - " values: [Healthcare, Finance, Education, Science and Technology, Environmental Science, Government]\n", - " num_new_values_to_generate: 5\n", + " - name: industry_sector\n", + " values: [Healthcare, Finance, Technology]\n", " subcategories:\n", - " - name: domain_description\n", - " description: High-level description of the domain, highlighting various types of data relevant to writing SQL\n", - " num_new_values_to_generate: 1\n", " - name: topic\n", - " description: Key topics that professional SQL developers care about in the given domain\n", - " num_new_values_to_generate: 15\n", + " values:\n", + " Healthcare:\n", + " - Electronic Health Records (EHR) Systems\n", + " - Telemedicine Platforms\n", + " - AI-Powered Diagnostic Tools\n", + " Finance:\n", + " - Fraud Detection Software\n", + " - Automated Trading Systems\n", + " - Personal Finance Apps\n", + " Technology:\n", + " - Cloud Computing Platforms\n", + " - Artificial Intelligence and Machine Learning Platforms\n", + " - DevOps and Continuous Integration/Continuous Deployment (CI/CD) Tools\n", "\n", " - name: sql_complexity\n", - " description: Complexity of the SQL query, ranging from basic operations to advanced data processing techniques\n", - " values:\n", - " - \"Basic SQL\"\n", - " - \"Aggregation\"\n", - " - \"Single Join\"\n", - " - \"Subquery\"\n", - " - \"Multiple Join\"\n", - " - \"Window Functions\"\n", + " values: [Beginner, Intermediate, Advanced]\n", " subcategories:\n", + " - name: sql_concept\n", + " values:\n", + " Beginner: [\"Basic SQL\", \"SELECT Statements\", \"WHERE Clauses\", \"Basic JOINs\", \"INSERT, UPDATE, DELETE\"]\n", + " Intermediate: [\"Aggregation\", \"Single JOIN\", \"Subquery\", \"Views\", \"Stored Procedures\"]\n", + " 
Advanced: [\"Multiple JOINs\", \"Window Functions\", \"Common Table Expressions (CTEs)\", \"Triggers\", \"Query Optimization\"]\n", " - name: sql_complexity_description\n", - " description: Description of the complexity level of the SQL query\n", + " generation_prompt: >-\n", + " Write a description of the complexity level of the given SQL complexity and SQL concept.\n", " num_new_values_to_generate: 1\n", "\n", " - name: sql_task_type\n", - " description: Type of SQL task that the query represents\n", " values:\n", " - \"Data Retrieval\"\n", " - \"Data Definition\"\n", @@ -119,51 +122,69 @@ " - \"Data Cleaning and Transformation\"\n", " subcategories:\n", " - name: sql_task_type_description\n", - " description: Description of the type of SQL task\n", + " generation_prompt: >-\n", + " Provide a detailed description of the SQL task type: {sql_task_type}.\n", " num_new_values_to_generate: 1\n", "\n", + " - name: instruction_phrase\n", + " values:\n", + " - \"Construct an SQL query to\"\n", + " - \"Formulate an SQL statement that\"\n", + " - \"Implement an SQL view that\"\n", + "\n", "generated_data_columns:\n", " - name: sql_prompt\n", " generation_prompt: >-\n", - " Create a natural language prompt to generate SQL in the field of {domain},\n", - " specifically about the topic of {topic}. Feel free to ask for data that\n", - " focus on a smaller subject within the scope of {domain_description}.\n", - " columns_to_list_in_prompt: all_categorical_seed_columns\n", - " llm_type: natural_language\n", + " Generate a clear and specific natural language instruction for creating an SQL query tailored to the {industry_sector} sector, focusing on the {topic} topic and the {sql_task_type} task. 
\n", + " Each instruction should begin with one of the following phrases: \"{instruction_phrase}\".\n", + " \n", + " Important Guidelines:\n", + " * Industry Relevance: Ensure the instruction is directly related to the {industry_sector} sector and the {topic} topic.\n", + " * Task Specificity: Clearly define the SQL task type ({sql_task_type}) to provide focused and actionable requirements.\n", + " * Complexity Alignment: Align the instruction with the appropriate SQL complexity level by implicitly incorporating relevant SQL concepts.\n", + " * Clarity and Precision: Craft the instruction to be unambiguous and straightforward, providing all necessary context without unnecessary verbosity.\n", + " * Response Formatting: Exclude any markers or similar formatting cues in the instruction.\n", + " columns_to_list_in_prompt: [industry_sector, topic, sql_task_type, instruction_phrase]\n", "\n", " - name: sql_context\n", " generation_prompt: >-\n", - " Write a SQL query that generates tables and views in a database and are\n", - " pertinent to the natural language prompt in {sql_prompt}.\n", - "\n", - " Include complete executable SQL table CREATE statements and/or view CREATE statements.\n", - " Provide up to five tables/views that are relevant to the user's natural language prompt.\n", - " Table names and schemas should correspond to the {domain} domain and focus on {domain_description}\n", - " columns_to_list_in_prompt: [domain, domain_description, topic, sql_prompt]\n", + " Generate a set of database tables and views that are pertinent to the SQL instruction in {sql_prompt} and the task type {sql_task_type} within the {industry_sector} sector and {topic} topic.\n", + " \n", + " Important Guidelines:\n", + " * Relevance: Ensure that all generated tables and views are directly related to the {industry_sector} sector and the {topic} topic. 
They should provide the necessary structure to support the SQL instruction effectively.\n", + " * Completeness: Include all essential columns with appropriate data types, primary keys, foreign keys, and necessary constraints to accurately represent real-world database schemas.\n", + " * Realism: Design realistic and practical table schemas that reflect typical structures used in the specified industry sector. Avoid overly simplistic or excessively complex schemas unless required by the task.\n", + " * Executable SQL: Provide complete and executable statements. Ensure that there are no syntax errors and that the statements can be run without modification.\n", + " * Consistency: Maintain consistent naming conventions for tables and columns, adhering to best practices (e.g., snake_case for table and column names).\n", + " * Response Formatting: Exclude any markers or similar formatting cues in the instruction.\n", + " columns_to_list_in_prompt: [industry_sector, topic, sql_prompt, sql_task_type]\n", " llm_type: code\n", - "\n", + " \n", " - name: sql\n", " generation_prompt: >-\n", - " Write an SQL query to answer/execute the natural language prompt in\n", - " {sql_prompt}.\n", - "\n", - " SQL should be based on the database context generated in {sql_context}.\n", - " SQL should leverage {sql_complexity}.\n", - " columns_to_list_in_prompt: [domain, topic, sql_complexity, sql_task_type]\n", + " Write an SQL query to answer/execute the following instruction and sql context.\n", + " Instruction: {sql_prompt}\\n\n", + " Context: {sql_context}\\n\n", + " \n", + " Important Guidelines:\n", + " * SQL Quality: Write self-contained and modular SQL code.\n", + " * SQL Validity: Please ensure that your SQL code is executable and does not contain any errors.\n", + " * Context: Base the SQL query on the provided database context in \"{sql_context}\". 
Ensure that all referenced tables, views, and columns exist within this context.\n", + " * Complexity & Concepts: The SQL should be written at a {sql_complexity} level, making use of concepts such as {sql_context}.\n", + " columns_to_list_in_prompt: [sql_prompt, sql_context, sql_complexity]\n", " llm_type: code\n", "\n", - "\n", "post_processors:\n", - " - validator: code\n", - " settings:\n", - " code_lang: tsql\n", - " code_columns: [sql_context, sql]\n", - "\n", - " - evaluator: text_to_sql\n", - " settings:\n", - " text_column: sql_prompt\n", - " code_column: sql\n", - " context_column: sql_context\n", + " - validator: code\n", + " settings:\n", + " code_lang: ansi\n", + " code_columns: [sql_context, sql]\n", + " \n", + " - evaluator: text_to_sql\n", + " settings:\n", + " text_column: sql_prompt\n", + " code_column: sql\n", + " context_column: sql_context\n", "\"\"\"\n", "\n", "data_designer = DataDesigner.from_config(config, **session_kwargs)"