diff --git a/docs/Orchestration/Orchestration.md b/docs/Orchestration/Orchestration.md index 2ca12c16f0..339ee59afe 100644 --- a/docs/Orchestration/Orchestration.md +++ b/docs/Orchestration/Orchestration.md @@ -1,6 +1,7 @@ --- title: Orchestration id: Orchestration +sidebar_class_name: hidden description: Airflow and Databricks Jobs tags: - jobs diff --git a/docs/Orchestration/_category_.json b/docs/Orchestration/_category_.json deleted file mode 100644 index 10d69d18d4..0000000000 --- a/docs/Orchestration/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Orchestration", - "position": 8, - "collapsible": true, - "collapsed": true -} diff --git a/docs/tutorials/Spark/_category_.json b/docs/Orchestration/airflow/_category_.json similarity index 68% rename from docs/tutorials/Spark/_category_.json rename to docs/Orchestration/airflow/_category_.json index 892ebfe00e..02601d80bc 100644 --- a/docs/tutorials/Spark/_category_.json +++ b/docs/Orchestration/airflow/_category_.json @@ -1,5 +1,5 @@ { - "label": "Spark tutorials", + "label": "Airflow", "position": 2, "collapsible": true, "collapsed": true diff --git a/docs/Orchestration/alternative-schedulers.md b/docs/Orchestration/alternative-schedulers.md index ee18ce4e8d..ae10d04c7c 100644 --- a/docs/Orchestration/alternative-schedulers.md +++ b/docs/Orchestration/alternative-schedulers.md @@ -1,6 +1,7 @@ --- title: Alternative Schedulers id: alternative-schedulers +sidebar_position: 3 description: Support for Alternative Orchestration Solutions tags: - jobs diff --git a/docs/Orchestration/databricks-jobs.md b/docs/Orchestration/databricks-jobs.md index 647653cdfb..01174b95d6 100644 --- a/docs/Orchestration/databricks-jobs.md +++ b/docs/Orchestration/databricks-jobs.md @@ -165,5 +165,5 @@ status of historic/current runs (success/failure/in-progress) for quick referenc ## Guides -1. [How to trigger a job from another job?](/tutorials/Orchestration/multi-jobs-trigger) -2. [How to design a reliable CI/CD process?](/tutorials/Orchestration/reliable-ci-cd) +1. [How to trigger a job from another job?](multi-jobs-trigger) +2. 
[How to design a reliable CI/CD process?](reliable-ci-cd) diff --git a/docs/tutorials/Orchestration/img/jobs-tigger-time-based.png b/docs/Orchestration/img/jobs-tigger-time-based.png similarity index 100% rename from docs/tutorials/Orchestration/img/jobs-tigger-time-based.png rename to docs/Orchestration/img/jobs-tigger-time-based.png diff --git a/docs/tutorials/Orchestration/img/jobs-tigger-trigger-based.png b/docs/Orchestration/img/jobs-tigger-trigger-based.png similarity index 100% rename from docs/tutorials/Orchestration/img/jobs-tigger-trigger-based.png rename to docs/Orchestration/img/jobs-tigger-trigger-based.png diff --git a/docs/tutorials/Orchestration/img/reliable-ci-cd/dev-qa-prod.png b/docs/Orchestration/img/reliable-ci-cd/dev-qa-prod.png similarity index 100% rename from docs/tutorials/Orchestration/img/reliable-ci-cd/dev-qa-prod.png rename to docs/Orchestration/img/reliable-ci-cd/dev-qa-prod.png diff --git a/docs/tutorials/Orchestration/img/reliable-ci-cd/md-fabrics.png b/docs/Orchestration/img/reliable-ci-cd/md-fabrics.png similarity index 100% rename from docs/tutorials/Orchestration/img/reliable-ci-cd/md-fabrics.png rename to docs/Orchestration/img/reliable-ci-cd/md-fabrics.png diff --git a/docs/tutorials/Orchestration/img/reliable-ci-cd/min-project-setup.png b/docs/Orchestration/img/reliable-ci-cd/min-project-setup.png similarity index 100% rename from docs/tutorials/Orchestration/img/reliable-ci-cd/min-project-setup.png rename to docs/Orchestration/img/reliable-ci-cd/min-project-setup.png diff --git a/docs/tutorials/Orchestration/img/reliable-ci-cd/prophecy-setup.png b/docs/Orchestration/img/reliable-ci-cd/prophecy-setup.png similarity index 100% rename from docs/tutorials/Orchestration/img/reliable-ci-cd/prophecy-setup.png rename to docs/Orchestration/img/reliable-ci-cd/prophecy-setup.png diff --git a/docs/tutorials/Orchestration/img/reliable-ci-cd/run-progress.png b/docs/Orchestration/img/reliable-ci-cd/run-progress.png similarity index 100% rename from docs/tutorials/Orchestration/img/reliable-ci-cd/run-progress.png rename to docs/Orchestration/img/reliable-ci-cd/run-progress.png diff --git a/docs/tutorials/Orchestration/multi-jobs-trigger.md b/docs/Orchestration/multi-jobs-trigger.md similarity index 88% rename from docs/tutorials/Orchestration/multi-jobs-trigger.md rename to docs/Orchestration/multi-jobs-trigger.md index ba35786a1a..8361019f1c 100644 --- a/docs/tutorials/Orchestration/multi-jobs-trigger.md +++ b/docs/Orchestration/multi-jobs-trigger.md @@ -2,7 +2,6 @@ title: Multi Jobs Trigger id: multi-jobs-trigger description: Complex pipeline interactions and timing -sidebar_position: 1 tags: - scheduling - jobs @@ -95,9 +94,3 @@ potential venue for the attacker. A better approach is to leverage Databricks se out [this guide](https://docs.databricks.com/security/secrets/secrets.html#create-a-secret-in-a-databricks-backed-scope) to learn how to create Databricks secrets. ::: - -
-
- -
- diff --git a/docs/tutorials/Orchestration/reliable-ci-cd.md b/docs/Orchestration/reliable-ci-cd.md similarity index 99% rename from docs/tutorials/Orchestration/reliable-ci-cd.md rename to docs/Orchestration/reliable-ci-cd.md index 995b94a164..64d82e1314 100644 --- a/docs/tutorials/Orchestration/reliable-ci-cd.md +++ b/docs/Orchestration/reliable-ci-cd.md @@ -3,7 +3,7 @@ title: Reliable CI/CD with Prophecy image: img/reliable-ci-cd/dev-qa-prod.png id: reliable-ci-cd description: Explore Continuous Integration and Continuous Delivery within Prophecy -sidebar_position: 2 +sidebar_position: 5 tags: - cicd - deployment diff --git a/docs/SQL/_category_.json b/docs/SQL/_category_.json deleted file mode 100644 index 3864b89e9f..0000000000 --- a/docs/SQL/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "SQL", - "position": 7, - "collapsible": true, - "collapsed": true -} diff --git a/docs/SQL/gems/transform/aggregate.md b/docs/SQL/gems/transform/aggregate.md index 81fdff1f6a..0baa8264e5 100644 --- a/docs/SQL/gems/transform/aggregate.md +++ b/docs/SQL/gems/transform/aggregate.md @@ -78,4 +78,4 @@ Using Config variables (and DBT Defined Configs) within a Gem is easy. Just wrap To learn more about the Aggregate Gem UI, see [this page](/docs/concepts/project/gems.md) which illustrates features common to all [Gems](/SQL/gems/gems.md). ::: -Here we used the Aggregate Gem from the HelloWorld_SQL Project as a learning guide. What types of Aggregations will you build? [Reach out](/docs/getting-started/getting-help/getting-help.md) with questions and to let us know how you're using Prophecy. +Here we used the Aggregate Gem from the HelloWorld_SQL Project as a learning guide. What types of Aggregations will you build? [Reach out](/docs/getting-help/getting-help.md) with questions and to let us know how you're using Prophecy. 
diff --git a/docs/SQL/sql.md b/docs/SQL/sql.md index 09fa42af27..0ce27c1fdf 100644 --- a/docs/SQL/sql.md +++ b/docs/SQL/sql.md @@ -2,6 +2,7 @@ title: Copilot for SQL users id: copilot-for-sql-users description: Using SQL with Prophecy's Data Transformation Copilot +sidebar_class_name: hidden tags: [sql, snowflake, databricks, warehouse] --- diff --git a/docs/Spark/Spark.md b/docs/Spark/Spark.md index cf5cf56de9..83e2671fad 100644 --- a/docs/Spark/Spark.md +++ b/docs/Spark/Spark.md @@ -2,6 +2,7 @@ title: Copilot for Spark users id: copilot-for-spark-users description: Using Spark with Prophecy's Data Transformation Copilot +sidebar_class_name: hidden tags: [spark, warehouse] --- diff --git a/docs/Spark/_category_.json b/docs/Spark/_category_.json deleted file mode 100644 index 85d07940cc..0000000000 --- a/docs/Spark/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Spark", - "position": 6, - "collapsible": true, - "collapsed": true -} diff --git a/docs/tutorials/Spark/img/xlsx_tgt_5.5.png b/docs/Spark/gems/source-target/file/img/xlsx_tgt_5.5.png similarity index 100% rename from docs/tutorials/Spark/img/xlsx_tgt_5.5.png rename to docs/Spark/gems/source-target/file/img/xlsx_tgt_5.5.png diff --git a/docs/tutorials/Spark/img/xlsx_tgt_6.png b/docs/Spark/gems/source-target/file/img/xlsx_tgt_6.png similarity index 100% rename from docs/tutorials/Spark/img/xlsx_tgt_6.png rename to docs/Spark/gems/source-target/file/img/xlsx_tgt_6.png diff --git a/docs/Spark/gems/source-target/file/xlsx.md b/docs/Spark/gems/source-target/file/xlsx.md index 71a9d559ac..5fd2baec21 100644 --- a/docs/Spark/gems/source-target/file/xlsx.md +++ b/docs/Spark/gems/source-target/file/xlsx.md @@ -67,7 +67,28 @@ The following is a list of options that are available while using XLSX as a **_T | Write Mode | Write mode, same as underlying Spark write mode | False | `"append"` | | Parition Columns | Columns to partition output files by | False | (empty) | -## Example output +## Writing a single output file + +When working with text-based files in Spark, your output isn't a single file but a directory containing multiple partitioned files due to Spark's distributed nature. + +For example, if you write to a location like **dbfs:/FileStore/Users/test/customers.xlsx**, you'll see the following in the DBFS: + +- A **customers.xlsx** directory. +- Partitions within the **customers.xlsx** directory. + +Each partition is a separate valid XLSX file with a segment of the overall output data. If you want to output only a single file, you'll need to: + +1. Add a Repartition Gem in **Coalesce** mode with the **Partition Count** set to `1`. + + ![Coalesce using Repartition](img/xlsx_tgt_5.5.png) + +2. Connect it between your second-to-last transformation and the `Target` Gem. + + ![Attach coalesce before desired target](img/xlsx_tgt_6.png) + +After running, your output will still be a directory, but this time it will only contain a single output file. + +## Example code Below is a snippet of the optimized code that is generated when using the XLSX source. 
diff --git a/docs/architecture/_category_.json b/docs/architecture/_category_.json deleted file mode 100644 index b73f12e808..0000000000 --- a/docs/architecture/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Architecture", - "position": 11, - "collapsible": true, - "collapsed": true -} diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md index 9fb4452be2..5d4ad477d8 100644 --- a/docs/architecture/architecture.md +++ b/docs/architecture/architecture.md @@ -1,6 +1,7 @@ --- title: Architecture id: architecture +sidebar_class_name: hidden description: Describing the architecture of Prophecy and how it can integrate into your use cases tags: [] --- diff --git a/docs/concepts/_category_.json b/docs/concepts/_category_.json deleted file mode 100644 index b337157b50..0000000000 --- a/docs/concepts/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Prophecy Concepts", - "position": 4, - "collapsible": true, - "collapsed": true -} diff --git a/docs/concepts/concepts.md b/docs/concepts/concepts.md index 3365a7f073..be9c00648b 100644 --- a/docs/concepts/concepts.md +++ b/docs/concepts/concepts.md @@ -1,6 +1,7 @@ --- title: Prophecy Concepts id: key-concepts +sidebar_class_name: hidden description: Key Concepts of Prophecy tags: [] --- diff --git a/docs/concepts/copilot/_category_.json b/docs/concepts/copilot/_category_.json deleted file mode 100644 index 15847a8d6c..0000000000 --- a/docs/concepts/copilot/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Data Copilot", - "position": 5, - "collapsible": true, - "collapsed": true -} diff --git a/docs/concepts/copilot/copilot-ai-capabilities.md b/docs/copilot/copilot-ai-capabilities.md similarity index 100% rename from docs/concepts/copilot/copilot-ai-capabilities.md rename to docs/copilot/copilot-ai-capabilities.md diff --git a/docs/concepts/copilot/copilot-data-privacy.md b/docs/copilot/copilot-data-privacy.md similarity index 100% rename from docs/concepts/copilot/copilot-data-privacy.md rename to docs/copilot/copilot-data-privacy.md diff --git a/docs/concepts/copilot/copilot.md b/docs/copilot/copilot.md similarity index 94% rename from docs/concepts/copilot/copilot.md rename to docs/copilot/copilot.md index 32c859d637..6ff8026a7b 100644 --- a/docs/concepts/copilot/copilot.md +++ b/docs/copilot/copilot.md @@ -19,7 +19,7 @@ Prophecy’s end-to-end platform makes data Pipeline development faster and acce -To learn more about what Data Copilot can do, see [Data Copilot AI capabilities](/concepts/copilot/copilot-ai-capabilities). +To learn more about what Data Copilot can do, see [Data Copilot AI capabilities](/docs/copilot/copilot-ai-capabilities.md). ## Knowledge Graph @@ -29,7 +29,7 @@ Data Copilot works by enhancing the user’s prompt using a knowledge graph. Eac Our knowledge graph includes metadata for a Project’s entities (such as Datasets, Schemas, Seeds, Models, and Pipelines) and the statistical usages of these entities. We built the knowledge graph intentionally to include metadata but not data. The Dataset structure is included in the knowledge graph but individual Dataset records are not. -For more details, see [Data privacy with Data Copilot](/concepts/copilot/copilot-data-privacy). +For more details, see [Data privacy with Data Copilot](/docs/copilot/copilot-data-privacy.md). ![Architecture](img/copilot_arch.png) @@ -39,7 +39,7 @@ Prophecy sends the enhanced Prompt to OpenAI. 
The large language model (LLM) ret Data Copilot is available for all customers using Prophecy’s managed Public SaaS offering and uses the public SaaS version of OpenAI's Language model. Customers using the Private SaaS or on-prem offerings can enable Data Copilot as a flag in the deployment / upgrade configuration. -For more details, see [Enable Data Copilot](/concepts/copilot/enable-data-copilot). +For more details, see [Enable Data Copilot](/docs/copilot/enable-data-copilot.md). ## FAQ @@ -71,7 +71,7 @@ Copilot features are included with Prophecy’s Spark and SQL offerings. There w #### Can I use my own private OpenAI instance? -Yes! Administrators have the option to connect Prophecy Data Copilot to their private subscription OpenAI from the Kubernetes cluster where Prophecy services are running. For details on how to do this, see [Installation](/concepts/copilot/enable-data-copilot#installation). +Yes! Administrators have the option to connect Prophecy Data Copilot to their private subscription OpenAI from the Kubernetes cluster where Prophecy services are running. For details on how to do this, see [Installation](/docs/copilot/enable-data-copilot.md#installation). #### Does Prophecy Data Copilot support text prompts in languages other than English? diff --git a/docs/concepts/copilot/enable-data-copilot.md b/docs/copilot/enable-data-copilot.md similarity index 91% rename from docs/concepts/copilot/enable-data-copilot.md rename to docs/copilot/enable-data-copilot.md index 51d43e96e6..aa3fb95cd0 100644 --- a/docs/concepts/copilot/enable-data-copilot.md +++ b/docs/copilot/enable-data-copilot.md @@ -10,9 +10,9 @@ tags: - upgrade --- -**Prophecy Data Copilot** is an AI-powered assistant that delivers intelligent suggestions and automates repetitive tasks for visual data transformations. You can read more about it at [Data Copilot](/concepts/copilot). +**Prophecy Data Copilot** is an AI-powered assistant that delivers intelligent suggestions and automates repetitive tasks for visual data transformations. You can read more about it at [Data Copilot](/docs/copilot/copilot.md). -Data Copilot leverages OpenAI's generative AI models to understand user intent, and enriched by the organizations' [knowledge graph](/concepts/copilot#knowledge-graph), to automate repetitive data engineering tasks. By default, Data Copilot leverages **Prophecy's managed OpenAI subscription and is entirely free** for existing Prophecy customers. Prophecy uses user queries and metadata when communicating with OpenAI. Prophecy never sends any customer data to OpenAI. +Data Copilot leverages OpenAI's generative AI models to understand user intent, and enriched by the organizations' [knowledge graph](/docs/copilot/copilot.md#knowledge-graph), to automate repetitive data engineering tasks. By default, Data Copilot leverages **Prophecy's managed OpenAI subscription and is entirely free** for existing Prophecy customers. Prophecy uses user queries and metadata when communicating with OpenAI. Prophecy never sends any customer data to OpenAI. However, for the most security conscious organizations, it is possible to configure Prophecy to use your own OpenAI endpoint. This page describes how to enable Prophecy Data Copilot for private VPC SaaS environments and configure it to use your own OpenAI or Azure OpenAI endpoint. 
diff --git a/docs/concepts/copilot/img/copilot_arch.png b/docs/copilot/img/copilot_arch.png similarity index 100% rename from docs/concepts/copilot/img/copilot_arch.png rename to docs/copilot/img/copilot_arch.png diff --git a/docs/concepts/copilot/img/copilot_code_suggestion.png b/docs/copilot/img/copilot_code_suggestion.png similarity index 100% rename from docs/concepts/copilot/img/copilot_code_suggestion.png rename to docs/copilot/img/copilot_code_suggestion.png diff --git a/docs/concepts/copilot/img/copilot_knowledge_graph.png b/docs/copilot/img/copilot_knowledge_graph.png similarity index 100% rename from docs/concepts/copilot/img/copilot_knowledge_graph.png rename to docs/copilot/img/copilot_knowledge_graph.png diff --git a/docs/concepts/copilot/img/copilot_next_suggestion.png b/docs/copilot/img/copilot_next_suggestion.png similarity index 100% rename from docs/concepts/copilot/img/copilot_next_suggestion.png rename to docs/copilot/img/copilot_next_suggestion.png diff --git a/docs/concepts/copilot/img/copilot_next_suggestion_expression.png b/docs/copilot/img/copilot_next_suggestion_expression.png similarity index 100% rename from docs/concepts/copilot/img/copilot_next_suggestion_expression.png rename to docs/copilot/img/copilot_next_suggestion_expression.png diff --git a/docs/concepts/copilot/img/copilot_text_to_Pipeline.png b/docs/copilot/img/copilot_text_to_Pipeline.png similarity index 100% rename from docs/concepts/copilot/img/copilot_text_to_Pipeline.png rename to docs/copilot/img/copilot_text_to_Pipeline.png diff --git a/docs/concepts/copilot/img/cross.svg b/docs/copilot/img/cross.svg similarity index 100% rename from docs/concepts/copilot/img/cross.svg rename to docs/copilot/img/cross.svg diff --git a/docs/concepts/copilot/img/data_copilot_open_ai_flow_architecture.png b/docs/copilot/img/data_copilot_open_ai_flow_architecture.png similarity index 100% rename from docs/concepts/copilot/img/data_copilot_open_ai_flow_architecture.png rename to docs/copilot/img/data_copilot_open_ai_flow_architecture.png diff --git a/docs/concepts/copilot/img/tick.svg b/docs/copilot/img/tick.svg similarity index 100% rename from docs/concepts/copilot/img/tick.svg rename to docs/copilot/img/tick.svg diff --git a/docs/deployment/_category_.json b/docs/deployment/_category_.json deleted file mode 100644 index b00f02936d..0000000000 --- a/docs/deployment/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Deployment", - "position": 9, - "collapsible": true, - "collapsed": true -} diff --git a/docs/deployment/deployment.md b/docs/deployment/deployment.md index f22fa7606b..f38f734b6f 100644 --- a/docs/deployment/deployment.md +++ b/docs/deployment/deployment.md @@ -1,6 +1,7 @@ --- title: Deployment id: Deployment +sidebar_class_name: hidden description: Release and Deploy Process. 
How to Release your Projects and Deploy your Jobs tags: - metadata diff --git a/docs/getting-started/getting-help/getting-help.md b/docs/getting-help/getting-help.md similarity index 99% rename from docs/getting-started/getting-help/getting-help.md rename to docs/getting-help/getting-help.md index 62feb216de..9b0b2370fc 100644 --- a/docs/getting-started/getting-help/getting-help.md +++ b/docs/getting-help/getting-help.md @@ -1,6 +1,7 @@ --- title: Getting help with Prophecy id: getting-help +sidebar_class_name: hidden sidebar_position: 6 description: Getting help with Prophecy sidebar_label: Getting Help diff --git a/docs/getting-started/getting-help/img/cluster_1.png b/docs/getting-help/img/cluster_1.png similarity index 100% rename from docs/getting-started/getting-help/img/cluster_1.png rename to docs/getting-help/img/cluster_1.png diff --git a/docs/getting-started/getting-help/img/cluster_2.png b/docs/getting-help/img/cluster_2.png similarity index 100% rename from docs/getting-started/getting-help/img/cluster_2.png rename to docs/getting-help/img/cluster_2.png diff --git a/docs/getting-started/getting-help/img/cluster_3.png b/docs/getting-help/img/cluster_3.png similarity index 100% rename from docs/getting-started/getting-help/img/cluster_3.png rename to docs/getting-help/img/cluster_3.png diff --git a/docs/getting-started/getting-help/img/cluster_4.png b/docs/getting-help/img/cluster_4.png similarity index 100% rename from docs/getting-started/getting-help/img/cluster_4.png rename to docs/getting-help/img/cluster_4.png diff --git a/docs/getting-started/getting-help/img/cluster_5.png b/docs/getting-help/img/cluster_5.png similarity index 100% rename from docs/getting-started/getting-help/img/cluster_5.png rename to docs/getting-help/img/cluster_5.png diff --git a/docs/getting-started/getting-help/img/pipeline_logs.png b/docs/getting-help/img/pipeline_logs.png similarity index 100% rename from docs/getting-started/getting-help/img/pipeline_logs.png rename to docs/getting-help/img/pipeline_logs.png diff --git a/docs/getting-started/getting-help/img/prophecy_connection_log.png b/docs/getting-help/img/prophecy_connection_log.png similarity index 100% rename from docs/getting-started/getting-help/img/prophecy_connection_log.png rename to docs/getting-help/img/prophecy_connection_log.png diff --git a/docs/getting-started/getting-help/img/prophecy_logs.png b/docs/getting-help/img/prophecy_logs.png similarity index 100% rename from docs/getting-started/getting-help/img/prophecy_logs.png rename to docs/getting-help/img/prophecy_logs.png diff --git a/docs/getting-started/getting-help/prophecy-details.md b/docs/getting-help/prophecy-details.md similarity index 100% rename from docs/getting-started/getting-help/prophecy-details.md rename to docs/getting-help/prophecy-details.md diff --git a/docs/getting-started/getting-help/spark-cluster-details.md b/docs/getting-help/spark-cluster-details.md similarity index 100% rename from docs/getting-started/getting-help/spark-cluster-details.md rename to docs/getting-help/spark-cluster-details.md diff --git a/docs/getting-started/_category_.json b/docs/getting-started/_category_.json deleted file mode 100644 index 68dd306f2a..0000000000 --- a/docs/getting-started/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Getting Started", - "position": 3, - "collapsible": true, - "collapsed": true -} diff --git a/docs/getting-started/getting-help/_category_.json b/docs/getting-started/getting-help/_category_.json deleted file mode 100644 index 
d56669ef1b..0000000000 --- a/docs/getting-started/getting-help/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Getting Help", - "position": 6, - "collapsible": true, - "collapsed": true -} diff --git a/docs/getting-started/getting-started-sql-snowflake.md b/docs/getting-started/getting-started-sql-snowflake.md index badfa648fd..a49fa472c0 100644 --- a/docs/getting-started/getting-started-sql-snowflake.md +++ b/docs/getting-started/getting-started-sql-snowflake.md @@ -265,7 +265,7 @@ Prophecy makes **interactively testing** the models incredibly easy! Simply: 2. Once the model runs, the **Result** icon appears. 3. Click the Result icon to view a **Sample** set of records. -Notice Copilot is offering suggested fixes when errors appear. See how **Fix with AI** works [here](/docs/concepts/copilot/copilot-ai-capabilities.md#fix-with-ai). Explore suggested fixes in the canvas, inside each transformation Gem, or inside Gem expressions. +Notice Copilot is offering suggested fixes when errors appear. See how **Fix with AI** works [here](/docs/copilot/copilot-ai-capabilities.md#fix-with-ai). Explore suggested fixes in the canvas, inside each transformation Gem, or inside Gem expressions. ## 5. Code view diff --git a/docs/getting-started/getting-started.md b/docs/getting-started/getting-started.md index f130e11fce..df648538c8 100644 --- a/docs/getting-started/getting-started.md +++ b/docs/getting-started/getting-started.md @@ -1,31 +1,35 @@ --- -title: Getting Started -sidebar_position: 3 +title: Quick starts +sidebar_class_name: hidden id: getting-started description: Getting started with Prophecy tags: [] --- -Prophecy offers a few different ways to sign up: public SaaS or private installation in the customer's VPC. +To quickly get started with Prophecy, you can begin with a free trial. -### Prophecy's public SaaS offering on the cloud +## Prophecy Enterprise SaaS -- [**Prophecy's Enterprise trial**](https://app.prophecy.io/metadata/auth/signup) - try out Prophecy's software-as-a-service offering free for for 21 days. Connect to your data, or opt for the Prophecy-provided Databricks account. - [![Signup](./img/Snow1_signup.png)](https://app.prophecy.io/metadata/auth/signup) +Begin a free 21-day trial by [signing up](https://app.prophecy.io/metadata/auth/signup). You'll be able to use your own data, or you can opt to use Prophecy-managed Databricks. -- **Prophecy via Databricks Partner Connect** - try out Prophecy's SaaS offering via an existing Databricks account. The following video shows how to get started with Prophecy using _Partner Connect_ from your Databricks UI. +## Prophecy via Databricks Partner Connect + +You can also try out Prophecy's SaaS offering via an existing Databricks account. The following video shows how to get started with Prophecy using _Partner Connect_ from your Databricks interface.
+
+ +:::info -### Prophecy's private offering installed in customer VPC +For more permanent deployment options, visit our page on [Prophecy deployment](docs/architecture/deployment/deployment.md). -- **Prophecy's Private Cloud** - [reach out](https://www.prophecy.io/request-a-demo) to explore Prophecy's Private cloud offering in your VPC. The installation is very simple, takes about 20 minutes (with a confirmation popup), and billing starts after 30 days. +::: ### Next steps -Prophecy can connect to your data wherever it resides, in various formats and storage options. Learn about Prophecy [concepts](/docs/concepts/concepts.md), read more about [deployment options](/docs/architecture/deployment/deployment.md), or reach out to [start a conversation](./getting-help/getting-help.md) today! +Try out these quick starts to get yourself up and running with Prophecy! ```mdx-code-block import DocCardList from '@theme/DocCardList'; diff --git a/docs/index.md b/docs/index.md index 7911e1a302..b6cff73440 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,7 +11,7 @@ tags: --- Prophecy empowers users of all skill levels to excel in data engineering. -Anyone can use the visual interface, especially with the help of the [Data Copilot](./concepts/copilot/), to achieve results that go beyond traditional ETL tools. +Anyone can use the visual interface, especially with the help of the [Data Copilot](/docs/copilot/copilot.md/), to achieve results that go beyond traditional ETL tools. Below, we highlight Prophecy's core pillars. ## Visual interface @@ -33,7 +33,7 @@ Prophecy provides dozens of Gems ready to use out of the box. Gems consist of Sp ## Code and DevOps -Running at scale requires applying strong software engineering practices to data refinement. Rapid development and deployment of data pipelines can be achieved by using code stored in Git, maintaining high test coverage, and implementing [continuous integration and continuous deployment](./tutorials/Orchestration/reliable-ci-cd/). In Prophecy, this looks like: +Running at scale requires applying strong software engineering practices to data refinement. Rapid development and deployment of data pipelines can be achieved by using code stored in Git, maintaining high test coverage, and implementing [continuous integration and continuous deployment](./Orchestration/reliable-ci-cd/). In Prophecy, this looks like: - **Pipelines stored as code.** Prophecy's code editor stores visual data Pipelines as high-quality code on Git. - **High test coverage.** Prophecy makes test generation and editing easy. 
diff --git a/docs/metadata/_category_.json b/docs/metadata/_category_.json deleted file mode 100644 index 3433bfebdf..0000000000 --- a/docs/metadata/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Metadata", - "position": 5, - "collapsible": true, - "collapsed": true -} diff --git a/docs/metadata/metadata.md b/docs/metadata/metadata.md index 40dbfc6fab..b5b1ecacd8 100644 --- a/docs/metadata/metadata.md +++ b/docs/metadata/metadata.md @@ -2,6 +2,7 @@ title: Metadata id: metadata description: Metadata +sidebar_class_name: hidden tags: [] --- diff --git a/docs/package-hub/_category_.json b/docs/package-hub/_category_.json deleted file mode 100644 index d045b0a3a0..0000000000 --- a/docs/package-hub/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Package Hub", - "position": 10, - "collapsible": true, - "collapsed": true -} diff --git a/docs/package-hub/package-hub.md b/docs/package-hub/package-hub.md index 88f842e015..f727644642 100644 --- a/docs/package-hub/package-hub.md +++ b/docs/package-hub/package-hub.md @@ -2,6 +2,7 @@ title: Package Hub id: package-hub description: Create and Share Reusable Pipeline Components +sidebar_class_name: hidden tags: [package-hub] --- diff --git a/docs/release_notes/_category_.json b/docs/release_notes/_category_.json deleted file mode 100644 index 0686d4d177..0000000000 --- a/docs/release_notes/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Release Notes", - "position": 12, - "collapsible": true, - "collapsed": true -} diff --git a/docs/release_notes/release_notes.md b/docs/release_notes/release_notes.md index cf18266650..ef9217bb88 100644 --- a/docs/release_notes/release_notes.md +++ b/docs/release_notes/release_notes.md @@ -2,6 +2,7 @@ title: Release Notes id: release_notes description: Prophecy release notes +sidebar_class_name: hidden tags: [] --- diff --git a/docs/settings/_category_.json b/docs/settings/_category_.json deleted file mode 100644 index 6dbb11af3f..0000000000 --- a/docs/settings/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Settings", - "position": 10, - "collapsible": true, - "collapsed": true -} diff --git a/docs/settings/settings.md b/docs/settings/settings.md index a2a2e405a1..7b2de0fffa 100644 --- a/docs/settings/settings.md +++ b/docs/settings/settings.md @@ -2,6 +2,7 @@ title: Settings id: settings description: This page describes the Settings pages of Prophecy +sidebar_class_name: hidden tags: - settings - admin diff --git a/docs/tutorials/Orchestration/_category_.json b/docs/tutorials/Orchestration/_category_.json deleted file mode 100644 index d291d70bfb..0000000000 --- a/docs/tutorials/Orchestration/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Orchestration tutorials", - "position": 2, - "collapsible": true, - "collapsed": true -} diff --git a/docs/tutorials/Orchestration/orchestration.md b/docs/tutorials/Orchestration/orchestration.md deleted file mode 100644 index c2c128450c..0000000000 --- a/docs/tutorials/Orchestration/orchestration.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: Orchestration tutorials -id: orchestration-tutorials -description: Orchestration tutorials and approaches to solving certain common Data Engineering problems within Prophecy -tags: [] ---- - -Orchestration tutorials include the following topics: - -```mdx-code-block -import DocCardList from '@theme/DocCardList'; -import {useCurrentSidebarCategory} from '@docusaurus/theme-common'; - - -``` diff --git a/docs/tutorials/Spark/excel.md b/docs/tutorials/Spark/excel.md deleted file mode 
100644 index a3af5184d7..0000000000 --- a/docs/tutorials/Spark/excel.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -title: Working with XLSX (Excel) files -id: working-with-excel -description: Explore the basics of working with XLSX (Excel) files with Prophecy -sidebar_position: 1 -tags: - - gems - - devops - - file - - tutorial - - xlsx ---- - -If you've worked with numbers in your day-to-day operations, odds are you've run into a need to use Excel at one point or another. This tutorial is going to cover the two most basic scenarios: Reading and Writing. - -:::info -For a full list of options supported by Prophecy when interacting with Excel files [see here](/Spark/gems/source-target/file/xlsx.md) -::: - -## Reading XLSX files - -Reading an Excel file is quite easy in Prophecy! Simply follow these steps to create a new XLSX source. - -1. Select the XLSX format
![Select XLSX source type](img/xlsx_src_1.png)
-2. Navigate to the desired XLSX source file
![Navigate to XLSX source file](img/xlsx_src_2.png)
-3. Customize any properties you might need and tweak the schema to your liking
![Adjust parameters and schema](img/xlsx_src_3.png)
-4. Preview the file and double-check that the schema matches your intentions
![Preview the input file](img/xlsx_src_4.png) - -Once the `Source` Gem is created and validation passes you'll be able to find the code of your new `Source` in the `graph` directory of your Pipeline code. - -````mdx-code-block -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - - -```py -def Source_0(spark: SparkSession) -> DataFrame: - if Config.fabricName == "demos": - return spark.read\ - .format("excel")\ - .option("header", True)\ - .option("dataAddress", "A1")\ - .option("inferSchema", True)\ - .load("dbfs:/FileStore/Users/scott/plain_number.xlsx") - else: - raise Exception("No valid dataset present to read fabric") - -``` - - - -```` - -## Writing XLSX files - -Writing an Excel file is just as easy, with only one small caveat to be discussed after. Let's look at an example Pipeline with an XLSX output target: - -![Sample Pipeline with XLSX output](img/xlsx_tgt_0.png) - -In this scenario we're building a report of spending by customer and want an XLSX file as output. - -1. Select the XLSX format
![Select the XLSX target type](img/xlsx_tgt_1.png)
-2. Navigate to the target location
![Select destination filename](img/xlsx_tgt_2.png)
-3. Customize any properties needed when writing the output file
![Set output parameters](img/xlsx_tgt_3.png)
-4. Run the Pipeline! - -Once the `Target` Gem is created and validation passes you'll be able to find the code of your new `Target` in the `graph` directory of your Pipeline code. - -````mdx-code-block - - - - -```py -def ExcelTarget(spark: SparkSession, in0: DataFrame): - if Config.fabricName == "demos": - in0.write\ - .format("excel")\ - .option("header", True)\ - .option("dataAddress", "A1")\ - .option("usePlainNumberFormat", False)\ - .mode("overwrite")\ - .save("dbfs:/FileStore/Users/scott/customers.xlsx") - else: - raise Exception("No valid dataset present to read fabric") -``` - - - -```` - -### Writing a single output file - -As mentioned above, there's a caveat when working with any text-based files in Spark. Because of the distributed nature of the framework, you'll find that your output file is not just a single output file but instead a directory with multiple separately partitioned files within it. - -For example, using `dbfs:/FileStore/Users/scott/customers.xlsx` as my Target location I can see the following in DBFS after running my Pipeline: - -1. `customers.xlsx` is, in reality, a directory...
![Target output is a directory](img/xlsx_tgt_4.png)
-2. ... that contains multiple partitions within it
![Target output is partitioned](img/xlsx_tgt_5.png)
- -Each file within this directory will be a separate valid XLSX file with a segment of the overall output data. If you want to output only a single file, you'll need to change your Pipeline as such: - -1. Add a `Repartition` Gem in `Coalesce` mode with the `Partition Count` set to `1`.
![Coalesce using Repartition](img/xlsx_tgt_5.5.png)
-2. Connect it between your second-to-last transformation and the `Target` Gem
![Attach coalesce before desired target](img/xlsx_tgt_6.png)
- -After running, your output will still be a directory, but this time it will only contain a single output file. - -![Single output after coalesce](img/xlsx_tgt_7.png) diff --git a/docs/tutorials/Spark/img/xlsx_src_1.png b/docs/tutorials/Spark/img/xlsx_src_1.png deleted file mode 100644 index 50a797331c..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_src_1.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_src_2.png b/docs/tutorials/Spark/img/xlsx_src_2.png deleted file mode 100644 index ca78c7f93a..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_src_2.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_src_3.png b/docs/tutorials/Spark/img/xlsx_src_3.png deleted file mode 100644 index f78d17d337..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_src_3.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_src_4.png b/docs/tutorials/Spark/img/xlsx_src_4.png deleted file mode 100644 index c9d46816b4..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_src_4.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_tgt_0.png b/docs/tutorials/Spark/img/xlsx_tgt_0.png deleted file mode 100644 index e8853eaee9..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_tgt_0.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_tgt_1.png b/docs/tutorials/Spark/img/xlsx_tgt_1.png deleted file mode 100644 index 8938df5069..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_tgt_1.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_tgt_2.png b/docs/tutorials/Spark/img/xlsx_tgt_2.png deleted file mode 100644 index 7314fe6e1a..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_tgt_2.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_tgt_3.png b/docs/tutorials/Spark/img/xlsx_tgt_3.png deleted file mode 100644 index b15d9423f1..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_tgt_3.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_tgt_4.png b/docs/tutorials/Spark/img/xlsx_tgt_4.png deleted file mode 100644 index 6a59b29c45..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_tgt_4.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_tgt_5.png b/docs/tutorials/Spark/img/xlsx_tgt_5.png deleted file mode 100644 index 968c667cea..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_tgt_5.png and /dev/null differ diff --git a/docs/tutorials/Spark/img/xlsx_tgt_7.png b/docs/tutorials/Spark/img/xlsx_tgt_7.png deleted file mode 100644 index 207c52d2fe..0000000000 Binary files a/docs/tutorials/Spark/img/xlsx_tgt_7.png and /dev/null differ diff --git a/docs/tutorials/Spark/spark.md b/docs/tutorials/Spark/spark.md deleted file mode 100644 index 1d389d6f87..0000000000 --- a/docs/tutorials/Spark/spark.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: Spark tutorials -id: spark-tutorials -description: Spark tutorials and approaches to solving certain common Data Engineering problems within Prophecy -tags: [] ---- - -Spark tutorials include the following topic: - -```mdx-code-block -import DocCardList from '@theme/DocCardList'; -import {useCurrentSidebarCategory} from '@docusaurus/theme-common'; - - -``` diff --git a/docs/tutorials/_category_.json b/docs/tutorials/_category_.json deleted file mode 100644 index 65c54ae99d..0000000000 --- a/docs/tutorials/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Tutorials", - "position": 13, - "collapsible": true, - "collapsed": true -} diff --git a/docs/tutorials/tutorials.md 
b/docs/tutorials/tutorials.md deleted file mode 100644 index 808364afd3..0000000000 --- a/docs/tutorials/tutorials.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: Tutorials -id: tutorials -description: Tutorials and approaches to solving certain common Data Engineering problems within Prophecy -tags: [] ---- - -Tutorials include the following topics: - -```mdx-code-block -import DocCardList from '@theme/DocCardList'; -import {useCurrentSidebarCategory} from '@docusaurus/theme-common'; - - -``` diff --git a/docs/tutorials/videos/_category_.json b/docs/tutorials/videos/_category_.json deleted file mode 100644 index 9bcc3bd1ee..0000000000 --- a/docs/tutorials/videos/_category_.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "Video tutorials", - "position": 3, - "collapsible": true, - "collapsed": true -} diff --git a/docs/tutorials/videos/design-pipeline.md b/docs/tutorials/videos/design-pipeline.md deleted file mode 100644 index 55f425e308..0000000000 --- a/docs/tutorials/videos/design-pipeline.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: Design a Pipeline -id: design-Pipeline -description: How to design your first Pipeline -sidebar_position: 2 -tags: - - Pipelines - - tutorial ---- - -
-
- -
- - -### Summary - -Design a data transformation Pipeline and generate a report - all using a visual to code interface for Apache Spark. - -### Description - -Using a visual interface, ingest a shipments Dataset and create a pricing summary report. Read and write from multiple data sources, including Snowflake and Delta Catalog Table. Run the Pipeline interactively and see the Job in Databricks. View the generated code - either Python or Scala - which runs on Apache Spark. In the next trainings, we'll see how to commit this code to Git, version our changes, schedule, and test our Pipeline. - -### Transcript - -[Connect to Git](https://fast.wistia.net/embed/channel/s98lbj0pfs?wchannelid=s98lbj0pfs&wvideoid=i61o34x245&wtime=0s) -Let’s get started on Prophecy for Databricks. After logging into Prophecy, create a project called Reporting. All the Pipelines that you’re going to build are turned into high quality code. Here you can choose the programming language of that code - either Python or Scala. - -Prophecy will store all of that code in repositories on Git. Git enables you to version all of your changes, collaborate easily with your team, and track exactly what code is deployed to production. You can connect to one of your existing Git repositories. If you don’t have one, Prophecy can create one for you. - -[Create Pipeline](https://fast.wistia.net/embed/channel/s98lbj0pfs?wchannelid=s98lbj0pfs&wvideoid=i61o34x245&wtime=53s) -As part of our Reporting project, let’s create our pricingReport Pipeline. Connect to a Spark Cluster. With one click, we can see our Spark cluster running in Databricks. - -[Overview and Define Source Data](https://fast.wistia.net/embed/channel/s98lbj0pfs?wchannelid=s98lbj0pfs&wvideoid=i61o34x245&wtime=80s) -Coming back to the Prophecy UI; we’re going to build a Pipeline to report the amount of business that was billed, shipped, and returned. With Prophecy I can read and write to any data source. We’re going to read from Snowflake. I store my credentials as Databricks secrets. Read from the ordershipments table. This table contains information about each order, whether the order was billed, shipped, or returned. We can see the schema right away. Load a data preview - the data looks as expected. Each record is an item to be shipped. We’ll use the columns relating to price and shipping status. - -[Choose Transformations](https://fast.wistia.net/embed/channel/s98lbj0pfs?wchannelid=s98lbj0pfs&wvideoid=i61o34x245&wtime=140s) -Let’s start to design our Pipeline by choosing some transformations; a Reformat Gem to Cleanup the Data, an Aggregate Gem to Sum the Amounts,an OrderBy Gem to OrderBy Shipment Status, then a Target Gem to write the Report to a Catalog table. - -[Build Custom Expressions](https://fast.wistia.net/embed/channel/s98lbj0pfs?wchannelid=s98lbj0pfs&wvideoid=i61o34x245&wtime=169s) -Configure each transformation. Select the columns of interest for our pricing report. Create some expressions to cleanup the Dataset. If the Tax is null, specify a default tax rate. Let’s also create a column to capture a business rule: a 'case when' statement marking an item as clearance. - -Configure the aggregate expressions. Start with basic SQL functions, and Prophecy will help you build expressions. Later Prophecy will convert these SparkSQL expressions into Python or Scala. Compute a sum of prices, discounts, and tax. Count the orders and whether the item was marked Clearance. Group-by whether the item was returned, and whether the item was delivered. 
- -[Interactive Execution](https://fast.wistia.net/embed/channel/s98lbj0pfs?wchannelid=s98lbj0pfs&wvideoid=i61o34x245&wtime=238s) -Run the Pipeline upto the sumAmounts Gem. Organize the Gems on the canvas. Let’s see what we’ve got so far. We can see the interim sample data output from each Gem. Data types are correct. We can see the summed amounts and orders, the returned or delivery statuses, and how many of these orders were marked clearance. We can see some basic statistics for each column. Configure the OrderBy Gem. We want to know if the item was returned and/or delivered. - -[Write to Delta Catalog Table](https://fast.wistia.net/embed/channel/s98lbj0pfs?wchannelid=s98lbj0pfs&wvideoid=i61o34x245&wtime=291s) -Configure the target Dataset. We’ll choose to write to a Delta Catalog table. Specify the details and some properties. Here we’ll overwrite the table schema, but there are lots of options. Run the Pipeline one final time to write the report to a Catalog table. - -[Toggle to View Code](https://fast.wistia.net/embed/channel/s98lbj0pfs?wchannelid=s98lbj0pfs&wvideoid=i61o34x245&wtime=326s) -We designed our Pipeline! Let’s see what the code looks like behind the scenes. Here is the graph representation, each function represents a Gem; the shipments Dataset, the cleanup function, the sumAmounts function. See the Cleanup function code; this is what you write as a highly skilled data engineer. - -Great! -In the next few trainings, we’ll see how to commit our code to Git, version our changes, schedule and test our Pipeline. - -See you next time! - -### Follow along checklist - -Create a repository. - -Snowflake and Databricks credentials are used here, but you can read/write to the data source(s) to which you have credentials. Setup [Databricks Secrets](https://docs.databricks.com/security/secrets/secrets.html#create-a-secret-in-a-databricks-backed-scope) to avoid exposing secrets when the project is committed to Git. - -Set Prophecy credentials while signing up for a free trial here: [App.Prophecy.io](https://App.Prophecy.io/) - -The Shipments Dataset is actually a table called ORDERSHIPMENTS in the TPC-H Dataset, and is available as sample data in Snowflake, Databricks File System, and many other data sources. The column names were edited for clarity. - -Go for it! Follow the steps outlined in the video above; ask questions on Prophecy's [Slack](https://join.slack.com/t/prophecy-io-support/shared_invite/zt-moq3xzoj-~5MSJ6WPnZfz7bwsqWi8tQ). When you are done, your repository should look something like [mine](https://Github.com/SimpleDataLabsInc/ShippingReports). diff --git a/docs/tutorials/videos/schedule-pipeline.md b/docs/tutorials/videos/schedule-pipeline.md deleted file mode 100644 index 5a1c2985c3..0000000000 --- a/docs/tutorials/videos/schedule-pipeline.md +++ /dev/null @@ -1,85 +0,0 @@ ---- -title: Schedule a Pipeline -id: schedule-Pipeline -description: How to schedule a versioned Prophecy Pipeline on Databricks Workflows. -sidebar_position: 2 -tags: - - Pipelines - - tutorial - - schedule - - Job - - databricksworkflow - - Git - - commit - - pullrequest - - merge - - release - - version ---- - -
-
- -
- - -### Summary - -Schedule a data Pipeline on Databricks using best software development practices. - -### Description - -Schedule a data Pipeline using best software development practices: commit to Git, pull request, merge, release a version of the Pipeline, see the scheduled Job in Databricks Workflows. - -### Transcript - -[Overview](https://fast.wistia.net/embed/iframe/dvayf1k9us?wtime=7s?seo=false?videoFoam=true) -Welcome back! Today we’ll schedule our Pipeline which is integrated with Databricks Jobs. You’ll see everything is encoded on Git. - -In the last training, we designed a Pipeline and we committed our code to Git. We started with a repository and a main branch. Prophecy created a feature branch and we committed our code to the feature branch. So the feature branch, dev/Anya, contains our Pipeline which reads a Dataset from snowflake, transforms the data, and writes a report to a Delta Catalog table. - -Today, we’ll schedule our Pipeline and commit our schedule details to our feature branch. - -[Create Schedule](https://fast.wistia.net/embed/iframe/dvayf1k9us?wtime=1m3s?seo=false?videoFoam=true) -It’s very easy to design and commit to Git using the best software development practices in Prophecy’s visual canvas. From our Pipeline, create a schedule to run the Pipeline at regular intervals and setup alerts. - -[Git Branches](https://fast.wistia.net/embed/iframe/dvayf1k9us?wtime=1m30s?seo=false?videoFoam=true) -Enable the Job, and prepare to release. Prophecy makes it easy for everyone to work with Git. On the left is our feature branch; on the right is our main branch. The main branch is where your final code should exist. We have done some new work, designing a new Pipeline and new schedule for the Pipeline. Let’s commit our work to our feature branch. - -[Commit, pull request, merge](https://fast.wistia.net/embed/iframe/dvayf1k9us?wtime=1m57s?seo=false?videoFoam=true) -With this commit, the feature branch contains both the Pipeline and the schedule. We like our Pipeline, we want our Pipeline and schedule to be a part of the main codebase. - -We could merge straightaway, but the best practice is to create a pull request so that our team can review our changes. Create the pull request according to your team’s standards. We’ve defined our pull request template in Prophecy here. If there are no further changes needed, let’s merge the feature branch to the main branch. - -[Release scheduled Job and version tag](https://Prophecy-1.wistia.com/medias/dvayf1k9us?wtime=3m9s) -Now the main branch contains the Pipeline and the scheduled Job. Let’s release the Job to run in Databricks and make sure everything is included in a specific version tag. That way we know exactly what Pipeline is running in our production environment. - -We have released the Job to run in Databricks workflows. At the same time, We have created a versioned release of our codebase. So, our Pipeline and schedule are now on the main branch, and a versioned tag has been created. - -[CI/CD](https://fast.wistia.net/embed/iframe/dvayf1k9us?wtime=4m9s?seo=false?videoFoam=true) -You’re probably curious if Prophecy’s deployments integrate with your jenkins, GitHub actions, or existing deployment toolset. YES! Use Prophecy Build tool to integrate with these in-house CI/CD tools. 
- -[Prophecy Build Tool](https://fast.wistia.net/embed/iframe/dvayf1k9us?wtime=4m9s?seo=false?videoFoam=true) (available [here](https://Github.com/SimpleDataLabsInc/Prophecy-build-tool)) - -- Triggers on every change that is pushed to the main branch - -- Sets the variables and dependencies - -- Builds the Pipelines and generates a jar or wheel file - -- Runs all unit tests - -- If the tests pass, Prophecy build tool will deploy a jar or wheel file to the configured Databricks location - -That concludes this training! We learned how to schedule our Pipeline, ommit to our feature branch, merge the feature branch with the main branch, release a version of our project. Now we know exactly what code will be deployed and we can see the Job running on Databricks. - -In the next training, we’ll see how to test our Pipeline. -See you next time! - -### Follow along checklist - -Databricks and Prophecy credentials are needed. Set Prophecy credentials while signing up for a free trial here: [App.Prophecy.io](https://App.Prophecy.io/). If you don't have Databricks credentials, the Prophecy app will guide you to use our "managed" Databricks cluster during your 14 day free trial. - -My repository is located [here](https://Github.com/SimpleDataLabsInc/ShippingReports); I just created an empty repository, designed Pipeline, scheduled, and committed the generated code from the Prophecy UI. - -Go for it! Follow the steps outlined in the video above; ask questions on Prophecy's [Slack](https://join.slack.com/t/Prophecy-io-support/shared_invite/zt-moq3xzoj-~5MSJ6WPnZfz7bwsqWi8tQ) diff --git a/docs/tutorials/videos/test-pipeline.md b/docs/tutorials/videos/test-pipeline.md deleted file mode 100644 index 1e9a238bdf..0000000000 --- a/docs/tutorials/videos/test-pipeline.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -title: Test a Pipeline -id: test-Pipeline -description: How to add unit tests to a Pipeline -sidebar_position: 3 -tags: - - Pipelines - - tutorial - - test - - custom - - unit ---- - -
-
- -
- - -### Summary - -Generate unit tests to ensure each transformation functions as expected. - -### Description - -Generate unit tests using Prophecy's template test builder, run tests and check results all in Prophecy's visual interface. Create custom data samples or configure Prophecy to generate a data sample based on your Pipeline. Go further with custom predicate expressions. Commit your tests, and Prophecy will run the tests any time the main branch is updated. Maintain tests in Prophecy and gate releases based on test outcomes. Achieve greater test coverage with Prophecy! - -### Transcript - -Welcome back! In the last trainings, we designed a Pipeline, scheduled the Pipeline, and now we’ll test the Pipeline. Let’s get started! - -[Basic test: Output Rows Equality](https://fast.wistia.net/embed/iframe/hs4r7qlsxo?wtime=18s?seo=false?videoFoam=true) -We’ll start with a test to make sure our aggregate function works as expected. Our SumAmounts Gem groups by return and delivery status. We’ll focus on the SUM_QTY column for simplicity. Here we’ll use the first option "Output rows equality." We will define input data and output data so that our Aggregate function always transforms the input data into the output data. (If preferred, there’s an option to generate a data sample from the larger Dataset.) Select the columns you’d like to include in the test. Notice the input and output Datasets have been started for us. These are the input columns that are needed to compute our desired output columns. Remember this Gem should group by returnflag and delivery status. Let’s say our return flags could be A or B, and our delivery status could be C or D. For simplicity, let’s say each order has quantity of one. We have created our input sample data. Now let’s create the output sample data. We expect the data should be grouped by return flag, then by delivery status. The quantity from each group is summed. - -[Run the basic test](https://fast.wistia.net/embed/iframe/hs4r7qlsxo?wtime=1m47s?seo=false?videoFoam=true) -We have created our test. Run the test right here in Prophecy. The test passed! So our Aggregate function, called SumAmounts, is working exactly as we thought. - -[See the basic test code](https://fast.wistia.net/embed/iframe/hs4r7qlsxo?wtime=2m3s?seo=false?videoFoam=true) -Let’s see what the code looks like behind the scenes. For this unit test here is the input data sample and the output data sample. This test computes sumAmounts given the input DataFrame and asserts the columns match the expected output Dataset. Wow, so that’s it! Creating a test in Prophecy is easy. Commit the code and your tests can be used to gate your releases. - -[Custom test: Output predicates](https://fast.wistia.net/embed/iframe/hs4r7qlsxo?wtime=2m39s?seo=false?videoFoam=true) -For those Prophecy users who want to dig deeper, we have a bonus segment to explore customizing tests. Here we’ll create a unit test for the Cleanup Gem. I created the expression here which handles erroneous zero tax values and replaces them with a default value. So this is the expression i want to test. Last time we created a test using the "Output rows equality" option, wherein we specify the input and output Datasets. Here we’ll customize our test with "Output predicates." Select the columns of interest. Define the expression your function should satisfy. Define some sample rows. Hopefully our function will replace the zero with the default value. 
- -[Run the custom test](https://fast.wistia.net/embed/iframe/hs4r7qlsxo?wtime=3m43s?seo=false?videoFoam=true) -We have created our test! Let’s run it right here in Prophecy. It passed! Before we save our work I want to find a scenario where I can make my test fail. If I remove this expression in my Cleanup Function - if I just pass the input Tax as it is with the zeros included - then my tests should fail. The predicate expression is not always true for this input Dataset. Great! Let’s return the expression to cleanup the data in the Cleanup Gem and save our work. - -[See the custom test code](https://fast.wistia.net/embed/iframe/hs4r7qlsxo?wtime=4m28s?seo=false?videoFoam=true) -Let’s see this code behind the scenes! Assert the computed output satisfies our custom predicate. - -[Recap](https://fast.wistia.net/embed/iframe/hs4r7qlsxo?wtime=4m38s?seo=false?videoFoam=true) -Wow, we created unit tests for the SumAmounts and Cleanup functions. We used two types of tests, including custom predicates, to make sure our functions are working as designed. Just commit this new code, merge, and release a version tag, exactly as in the previous video training. Your tests will be included in your Pipeline release sequence, so you can be alerted if your functions stop passing these code quality tests. Nice! In the next training we’ll see how to monitor our Pipelines over time. - -See you next time! - -### Follow along checklist - -Databricks and Prophecy credentials are needed. Set Prophecy credentials while signing up for a free trial here: [App.Prophecy.io](https://App.Prophecy.io/). If you don't have Databricks credentials, the Prophecy app will guide you to use our "managed" Databricks cluster during your 14 day free trial. - -My repository is located [here](https://Github.com/SimpleDataLabsInc/ShippingReports); I just created an empty repository, designed, scheduled, and tested my Pipeline as shown in Prophecy, then committed the generated code from the Prophecy UI. - -Go for it! 
-
-[Recap](https://fast.wistia.net/embed/iframe/hs4r7qlsxo?wtime=4m38s?seo=false?videoFoam=true)
-Wow, we created unit tests for the SumAmounts and Cleanup functions. We used two types of tests, including custom predicates, to make sure our functions are working as designed. Just commit this new code, merge, and release a version tag, exactly as in the previous video training. Your tests will be included in your Pipeline release sequence, so you can be alerted if your functions stop passing these code quality tests. Nice! In the next training, we’ll see how to monitor our Pipelines over time.
-
-See you next time!
-
-### Follow along checklist
-
-Databricks and Prophecy credentials are needed. Set up Prophecy credentials by signing up for a free trial here: [App.Prophecy.io](https://App.Prophecy.io/). If you don't have Databricks credentials, the Prophecy app will guide you to use our "managed" Databricks cluster during your 14-day free trial.
-
-My repository is located [here](https://Github.com/SimpleDataLabsInc/ShippingReports); I just created an empty repository, designed, scheduled, and tested my Pipeline as shown in Prophecy, then committed the generated code from the Prophecy UI.
-
-Go for it! Follow the steps outlined in the video above; ask questions on Prophecy's [Slack](https://join.slack.com/t/Prophecy-io-support/shared_invite/zt-moq3xzoj-~5MSJ6WPnZfz7bwsqWi8tQ)
diff --git a/docs/tutorials/videos/videos.md b/docs/tutorials/videos/videos.md
deleted file mode 100644
index b388faca22..0000000000
--- a/docs/tutorials/videos/videos.md
+++ /dev/null
@@ -1,15 +0,0 @@
----
-title: Video tutorials
-id: video-tutorials
-description: Video tutorials and approaches to solving certain common Data Engineering problems within Prophecy
-tags: []
----
-
-Video tutorials include the following topics:
-
-```mdx-code-block
-import DocCardList from '@theme/DocCardList';
-import {useCurrentSidebarCategory} from '@docusaurus/theme-common';
-
-<DocCardList items={useCurrentSidebarCategory().items} />
-```
diff --git a/docusaurus.config.js b/docusaurus.config.js
index b87c8d0ea5..eb23cded8b 100644
--- a/docusaurus.config.js
+++ b/docusaurus.config.js
@@ -53,6 +53,11 @@ const config = {
       zoom: {
         selector: ".markdown :not(em) > img",
       },
+      docs: {
+        sidebar: {
+          autoCollapseCategories: true,
+        },
+      },
       navbar: {
         title: "",
         logo: {
@@ -132,19 +137,19 @@ const config = {
          from: "/developer/videos/",
        },
        {
-         to: "/getting-started/getting-help/spark-cluster-details",
+         to: "/getting-help/spark-cluster-details",
          from: "/getting-started/getting-help/logs/config-sparkui",
        },
        {
-         to: "/getting-started/getting-help/prophecy-details",
+         to: "/getting-help/prophecy-details",
          from: "/getting-started/getting-help/logs/",
        },
        {
-         to: "/getting-started/getting-help/spark-cluster-details",
+         to: "/getting-help/spark-cluster-details",
          from: "/getting-started/getting-help/logs/config-sparknotebook",
        },
        {
-         to: "/concepts/copilot/enable-data-copilot",
+         to: "/copilot/enable-data-copilot",
          from: "/architecture/deployment/enable-data-copilot",
        },
        {
@@ -219,6 +224,42 @@ const config = {
          to: "/architecture/self-hosted/installation-helm/",
          from: "/architecture/deployment/installation-guide",
        },
+       {
+         to: "/Orchestration/multi-jobs-trigger",
+         from: "/tutorials/Orchestration/multi-jobs-trigger",
+       },
+       {
+         to: "/Orchestration/reliable-ci-cd",
+         from: "/tutorials/Orchestration/reliable-ci-cd",
+       },
+       {
+         to: "/Orchestration/",
+         from: "/tutorials/Orchestration/",
+       },
+       {
+         to: "/Spark/gems/source-target/file/xlsx",
+         from: "/tutorials/Spark/excel",
+       },
+       {
+         to: "/Spark/",
+         from: "/tutorials/Spark/",
+       },
+       {
+         to: "/Spark/",
+         from: "/tutorials/videos/design-pipeline",
+       },
+       {
+         to: "/Spark/",
+         from: "/tutorials/videos/schedule-pipeline",
+       },
+       {
+         to: "/Spark/",
+         from: "/tutorials/videos/test-pipeline",
+       },
+       {
+         to: "/Spark/",
+         from: "/tutorials/videos/",
+       },
      ],

      /*
@@ -258,6 +299,17 @@ const config = {
          if (existingPath.includes("/Orchestration")) {
            return [existingPath.replace("/Orchestration", "/low-code-jobs")];
          }
+         if (existingPath.includes("/getting-help")) {
+           return [
+             existingPath.replace(
+               "/getting-help",
+               "/getting-started/getting-help",
+             ),
+           ];
+         }
+         if (existingPath.includes("/copilot")) {
+           return [existingPath.replace("/copilot", "/concepts/copilot")];
+         }
          return undefined;
        },
      },
diff --git a/sidebars.js b/sidebars.js
index fe90ecab4e..adf0410d96 100644
--- a/sidebars.js
+++ b/sidebars.js
@@ -14,18 +14,136 @@
 /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */
 const sidebars = {
   // By default, Docusaurus generates a sidebar from the docs folder structure
-  defaultSidebar: [{ type: "autogenerated", dirName: "." }],

   // But you can create a sidebar manually
-  /*
-  tutorialSidebar: [
+  mySidebar: [
+    "index",
     {
-      type: 'category',
-      label: 'Tutorial',
-      items: ['hello'],
+      type: "html",
+      value: '',
+    },
+    {
+      type: "html",
+      className: "sidebar-title",
+      value: "Overview",
+      defaultStyle: true,
+    },
+    {
+      type: "category",
+      label: "Quick starts",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "getting-started/getting-started" },
+      items: [{ type: "autogenerated", dirName: "getting-started" }],
+    },
+    {
+      type: "category",
+      label: "Concepts",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "concepts/key-concepts" },
+      items: [{ type: "autogenerated", dirName: "concepts" }],
+    },
+    {
+      type: "category",
+      label: "Data Copilot",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "copilot/copilot" },
+      items: [{ type: "autogenerated", dirName: "copilot" }],
+    },
+    {
+      type: "category",
+      label: "Release notes",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "release_notes/release_notes" },
+      items: [{ type: "autogenerated", dirName: "release_notes" }],
+    },
+    {
+      type: "html",
+      value: '',
+    },
+    {
+      type: "html",
+      className: "sidebar-title",
+      value: "Guides",
+      defaultStyle: true,
+    },
+    {
+      type: "category",
+      label: "Metadata",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "metadata/metadata" },
+      items: [{ type: "autogenerated", dirName: "metadata" }],
+    },
+    {
+      type: "category",
+      label: "Spark",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "Spark/copilot-for-spark-users" },
+      items: [{ type: "autogenerated", dirName: "Spark" }], // Lowercase "spark"
+    },
+    {
+      type: "category",
+      label: "SQL",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "SQL/copilot-for-sql-users" },
+      items: [{ type: "autogenerated", dirName: "SQL" }], // Lowercase "sql"
+    },
+    {
+      type: "category",
+      label: "Orchestration",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "Orchestration/Orchestration" },
+      items: [{ type: "autogenerated", dirName: "Orchestration" }], // Lowercase
+    },
+    {
+      type: "category",
+      label: "Release and deploy",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "deployment/Deployment" },
+      items: [{ type: "autogenerated", dirName: "deployment" }],
+    },
+    {
+      type: "category",
+      label: "Package Hub",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "package-hub/package-hub" },
+      items: [{ type: "autogenerated", dirName: "package-hub" }],
+    },
+    {
+      type: "doc",
+      id: "settings/settings",
+    },
+    {
+      type: "category",
+      label: "Architecture",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "architecture/architecture" },
+      items: [{ type: "autogenerated", dirName: "architecture" }],
+    },
+    {
+      type: "html",
+      value: '',
+    },
+    {
+      type: "category",
+      label: "Getting help",
+      collapsible: true,
+      collapsed: true,
+      link: { type: "doc", id: "getting-help/getting-help" },
+      items: [{ type: "autogenerated", dirName: "getting-help" }],
     },
   ],
-   */
 };

 module.exports = sidebars;
diff --git a/src/css/custom.css b/src/css/custom.css
index df55a01247..8074a77a60 100644
--- a/src/css/custom.css
+++ b/src/css/custom.css
@@ -183,3 +183,22 @@ Gems table page
   line-height: 1;
   padding: var(--ifm-badge-padding-vertical) var(--ifm-badge-padding-horizontal);
 }
+
+/* Sidebar styles from https://github.com/facebook/docusaurus/pull/6519 */
+.sidebar-title {
+  font-size: 0.8rem;
+  letter-spacing: 0.05rem;
+  text-transform: uppercase;
+  font-weight: 600;
+  color: var(--ifm-color-gray-600);
+}
+
+.sidebar-divider {
+  border-top: 1px solid var(--ifm-color-gray-200);
+  display: block;
+  margin: 0.5rem 0.5rem 0.25rem;
+}
+
+.hidden {
+  display: none !important;
+}