From 3d92dcf5e8932a1073e7d2a5ed5119a931752572 Mon Sep 17 00:00:00 2001 From: Dhruv Date: Tue, 12 Nov 2024 18:46:50 +0000 Subject: [PATCH 1/2] update python blueprint --- ...tor-data-designer-sdk-text-to-python.ipynb | 63 +++++++++---------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-python.ipynb b/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-python.ipynb index 99fcde21..60fe4097 100644 --- a/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-python.ipynb +++ b/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-python.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "mNoaC7dX28y0" }, @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "1k5NjjtzPQJi" }, @@ -81,7 +81,7 @@ "\n", "special_system_instructions: >-\n", " You are an expert at writing, analyzing, and editing Python code. You know what\n", - " a high-quality, clean, efficient, and maintainable Python code looks like. You\n", + " high-quality, clean, efficient, and maintainable Python code looks like. You\n", " excel at transforming natural language into Python, as well as Python back into\n", " natural language. 
Your job is to assist the user with their Python-related tasks.\n", "\n", @@ -105,47 +105,41 @@ " - DevOps and Continuous Integration/Continuous Deployment (CI/CD) Tools\n", "\n", " - name: code_complexity\n", - " values: [Intermediate, Advanced, Expert]\n", + " values: [Beginner, Intermediate, Advanced]\n", " subcategories:\n", " - name: code_concept\n", " values:\n", - " Intermediate: [Functions, List Comprehensions, Classes]\n", - " Advanced: [Object-oriented programming, Error Handling, Lambda Functions]\n", - " Expert: [Decorators, Multithreading, Context Managers]\n", - "\n", - " - name: prompt_type\n", - " values: [instruction, question]\n", - " subcategories:\n", - " - name: prompt_creation_instruction\n", - " values:\n", - " instruction:\n", - " - Write an instruction for a user to write Python code for a specific task.\n", - " - Generate a clear and concise instruction for a Python programming challenge.\n", - " question:\n", - " - Ask a specific question about how to solve a problem using Python code.\n", - " - Generate a question about how to perform a general task in Python.\n", + " Beginner: [Variables, Data Types, Functions, Loops, Classes]\n", + " Intermediate: [List Comprehensions, Object-oriented programming, Lambda Functions, Web frameworks, Pandas]\n", + " Advanced: [Multithreading, Context Managers, Generators]\n", + " \n", + " - name: instruction_phrase\n", + " values: [\"Write a function that\", \"Create a class that\", \"Implement a script\", \"Can you create a function\", \"Develop a module that\"]\n", "\n", "generated_data_columns:\n", - " - name: text\n", + " - name: instruction\n", " generation_prompt: >-\n", - " {prompt_creation_instruction} \\n\n", - "\n", - " ### Important Guidelines ###\n", - " * Make sure the {prompt_type} is related to {topic} in the {industry_sector} sector.\n", - " * Do not write any code as part of the {prompt_type}.\n", + " Generate an instruction to create Python code that solves a specific problem. 
Each instruction should begin with one of the following phrases: {instruction_phrase}.\n", + " \n", + " Important Guidelines:\n", + " * Industry Relevance: Ensure the instruction pertains to the {industry_sector} sector and {topic} topic.\n", + " * Code Complexity: Tailor the instruction to the {code_complexity} level. Utilize relevant {code_concept} where appropriate to match the complexity level.\n", + " * Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n", + " * Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n", " columns_to_list_in_prompt: all_categorical_seed_columns\n", "\n", " - name: code\n", " generation_prompt: >-\n", - " Write Python code that will be paired with the following prompt:\n", - " {text} \\n\n", + " Write Python code for the following instruction:\n", + " Instruction: {instruction}\\n\n", "\n", - " ### Important Guidelines ###\n", - " * Your code should be self-contained and executable.\n", - " * Remember to import any necessary libraries.\n", - " * The code should be written at a {code_complexity} level and make use of {code_concept}.\n", + " Important Guidelines:\n", + " * Code Quality: Your code should be clean, complete, self-contained and accurate.\n", + " * Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n", + " * Packages: Remember to import any necessary libraries, and to use all libraries you import.\n", + " * Complexity & Concepts: The code should be written at a {code_complexity} level, making use of concepts such as {code_concept}.\n", " llm_type: code\n", - " columns_to_list_in_prompt: [industry_sector, topic]\n", + " columns_to_list_in_prompt: [topic]\n", "\n", "post_processors:\n", " - validator: code\n", @@ -155,7 +149,7 @@ "\n", " - evaluator: text_to_python\n", " settings:\n", - " text_column: text\n", + " 
text_column: instruction\n", " code_column: code\n", "\"\"\"\n", "\n", @@ -288,6 +282,7 @@ }, "kernelspec": { "display_name": "Python 3", + "language": "python", "name": "python3" }, "language_info": { @@ -300,7 +295,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.10.12" } }, "nbformat": 4, From 027e6420e9278378d3a57de4e7254d767f4e73e8 Mon Sep 17 00:00:00 2001 From: Dhruv Date: Tue, 19 Nov 2024 02:10:39 +0000 Subject: [PATCH 2/2] update sql blueprint --- ...igator-data-designer-sdk-text-to-sql.ipynb | 125 ++++++++++-------- 1 file changed, 73 insertions(+), 52 deletions(-) diff --git a/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-sql.ipynb b/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-sql.ipynb index 379ecbd6..054e8328 100644 --- a/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-sql.ipynb +++ b/docs/notebooks/demo/navigator/navigator-data-designer-sdk-text-to-sql.ipynb @@ -72,44 +72,47 @@ "outputs": [], "source": [ "config = \"\"\"\n", - "model_suite: llama-3.x\n", + "model_suite: apache-2.0\n", "\n", "special_system_instructions: >-\n", " You are an expert at writing, analyzing and editing SQL queries. You know what\n", " a high-quality, clean, efficient, and maintainable SQL code looks like. You\n", " excel at transforming natural language into SQL, as well as SQL back into\n", " natural language. 
Your job is to assist the user with their SQL-related tasks.\n", - " Leverage T-SQL only.\n", "\n", "categorical_seed_columns:\n", - " - name: domain\n", - " description: Major industry domain or sector that relies on robust data solutions\n", - " values: [Healthcare, Finance, Education, Science and Technology, Environmental Science, Government]\n", - " num_new_values_to_generate: 5\n", + " - name: industry_sector\n", + " values: [Healthcare, Finance, Technology]\n", " subcategories:\n", - " - name: domain_description\n", - " description: High-level description of the domain, highlighting various types of data relevant to writing SQL\n", - " num_new_values_to_generate: 1\n", " - name: topic\n", - " description: Key topics that professional SQL developers care about in the given domain\n", - " num_new_values_to_generate: 15\n", + " values:\n", + " Healthcare:\n", + " - Electronic Health Records (EHR) Systems\n", + " - Telemedicine Platforms\n", + " - AI-Powered Diagnostic Tools\n", + " Finance:\n", + " - Fraud Detection Software\n", + " - Automated Trading Systems\n", + " - Personal Finance Apps\n", + " Technology:\n", + " - Cloud Computing Platforms\n", + " - Artificial Intelligence and Machine Learning Platforms\n", + " - DevOps and Continuous Integration/Continuous Deployment (CI/CD) Tools\n", "\n", " - name: sql_complexity\n", - " description: Complexity of the SQL query, ranging from basic operations to advanced data processing techniques\n", - " values:\n", - " - \"Basic SQL\"\n", - " - \"Aggregation\"\n", - " - \"Single Join\"\n", - " - \"Subquery\"\n", - " - \"Multiple Join\"\n", - " - \"Window Functions\"\n", + " values: [Beginner, Intermediate, Advanced]\n", " subcategories:\n", + " - name: sql_concept\n", + " values:\n", + " Beginner: [\"Basic SQL\", \"SELECT Statements\", \"WHERE Clauses\", \"Basic JOINs\", \"INSERT, UPDATE, DELETE\"]\n", + " Intermediate: [\"Aggregation\", \"Single JOIN\", \"Subquery\", \"Views\", \"Stored Procedures\"]\n", + " 
Advanced: [\"Multiple JOINs\", \"Window Functions\", \"Common Table Expressions (CTEs)\", \"Triggers\", \"Query Optimization\"]\n", " - name: sql_complexity_description\n", - " description: Description of the complexity level of the SQL query\n", + " generation_prompt: >-\n", + " Write a description of the complexity level of the given SQL complexity and SQL concept.\n", " num_new_values_to_generate: 1\n", "\n", " - name: sql_task_type\n", - " description: Type of SQL task that the query represents\n", " values:\n", " - \"Data Retrieval\"\n", " - \"Data Definition\"\n", @@ -119,51 +122,69 @@ " - \"Data Cleaning and Transformation\"\n", " subcategories:\n", " - name: sql_task_type_description\n", - " description: Description of the type of SQL task\n", + " generation_prompt: >-\n", + " Provide a detailed description of the SQL task type: {sql_task_type}.\n", " num_new_values_to_generate: 1\n", "\n", + " - name: instruction_phrase\n", + " values:\n", + " - \"Construct an SQL query to\"\n", + " - \"Formulate an SQL statement that\"\n", + " - \"Implement an SQL view that\"\n", + "\n", "generated_data_columns:\n", " - name: sql_prompt\n", " generation_prompt: >-\n", - " Create a natural language prompt to generate SQL in the field of {domain},\n", - " specifically about the topic of {topic}. Feel free to ask for data that\n", - " focus on a smaller subject within the scope of {domain_description}.\n", - " columns_to_list_in_prompt: all_categorical_seed_columns\n", - " llm_type: natural_language\n", + " Generate a clear and specific natural language instruction for creating an SQL query tailored to the {industry_sector} sector, focusing on the {topic} topic and the {sql_task_type} task. 
\n", + " Each instruction should begin with one of the following phrases: \"{instruction_phrase}\".\n", + " \n", + " Important Guidelines:\n", + " * Industry Relevance: Ensure the instruction is directly related to the {industry_sector} sector and the {topic} topic.\n", + " * Task Specificity: Clearly define the SQL task type ({sql_task_type}) to provide focused and actionable requirements.\n", + " * Complexity Alignment: Align the instruction with the appropriate SQL complexity level by implicitly incorporating relevant SQL concepts.\n", + " * Clarity and Precision: Craft the instruction to be unambiguous and straightforward, providing all necessary context without unnecessary verbosity.\n", + " * Response Formatting: Exclude any markers or similar formatting cues in the instruction.\n", + " columns_to_list_in_prompt: [industry_sector, topic, sql_task_type, instruction_phrase]\n", "\n", " - name: sql_context\n", " generation_prompt: >-\n", - " Write a SQL query that generates tables and views in a database and are\n", - " pertinent to the natural language prompt in {sql_prompt}.\n", - "\n", - " Include complete executable SQL table CREATE statements and/or view CREATE statements.\n", - " Provide up to five tables/views that are relevant to the user's natural language prompt.\n", - " Table names and schemas should correspond to the {domain} domain and focus on {domain_description}\n", - " columns_to_list_in_prompt: [domain, domain_description, topic, sql_prompt]\n", + " Generate a set of database tables and views that are pertinent to the SQL instruction in {sql_prompt} and the task type {sql_task_type} within the {industry_sector} sector and {topic} topic.\n", + " \n", + " Important Guidelines:\n", + " * Relevance: Ensure that all generated tables and views are directly related to the {industry_sector} sector and the {topic} topic. 
They should provide the necessary structure to support the SQL instruction effectively.\n", + " * Completeness: Include all essential columns with appropriate data types, primary keys, foreign keys, and necessary constraints to accurately represent real-world database schemas.\n", + " * Realism: Design realistic and practical table schemas that reflect typical structures used in the specified industry sector. Avoid overly simplistic or excessively complex schemas unless required by the task.\n", + " * Executable SQL: Provide complete and executable statements. Ensure that there are no syntax errors and that the statements can be run without modification.\n", + " * Consistency: Maintain consistent naming conventions for tables and columns, adhering to best practices (e.g., snake_case for table and column names).\n", + " * Response Formatting: Exclude any markers or similar formatting cues in the instruction.\n", + " columns_to_list_in_prompt: [industry_sector, topic, sql_prompt, sql_task_type]\n", " llm_type: code\n", - "\n", + " \n", " - name: sql\n", " generation_prompt: >-\n", - " Write an SQL query to answer/execute the natural language prompt in\n", - " {sql_prompt}.\n", - "\n", - " SQL should be based on the database context generated in {sql_context}.\n", - " SQL should leverage {sql_complexity}.\n", - " columns_to_list_in_prompt: [domain, topic, sql_complexity, sql_task_type]\n", + " Write an SQL query to answer/execute the following instruction and sql context.\n", + " Instruction: {sql_prompt}\\n\n", + " Context: {sql_context}\\n\n", + " \n", + " Important Guidelines:\n", + " * SQL Quality: Write self-contained and modular SQL code.\n", + " * SQL Validity: Please ensure that your SQL code is executable and does not contain any errors.\n", + " * Context: Base the SQL query on the provided database context in \"{sql_context}\". 
Ensure that all referenced tables, views, and columns exist within this context.\n", + " * Complexity & Concepts: The SQL should be written at a {sql_complexity} level, making use of concepts such as {sql_concept}.\n", + " columns_to_list_in_prompt: [sql_prompt, sql_context, sql_complexity]\n", " llm_type: code\n", "\n", - "\n", "post_processors:\n", - " - validator: code\n", - " settings:\n", - " code_lang: tsql\n", - " code_columns: [sql_context, sql]\n", - "\n", - " - evaluator: text_to_sql\n", - " settings:\n", - " text_column: sql_prompt\n", - " code_column: sql\n", - " context_column: sql_context\n", + " - validator: code\n", + " settings:\n", + " code_lang: ansi\n", + " code_columns: [sql_context, sql]\n", + " \n", + " - evaluator: text_to_sql\n", + " settings:\n", + " text_column: sql_prompt\n", + " code_column: sql\n", + " context_column: sql_context\n", "\"\"\"\n", "\n", "data_designer = DataDesigner.from_config(config, **session_kwargs)"