From a44b268554856bac1a86f5a4d73a87b8647df45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Thu, 22 Sep 2022 20:24:29 +0200 Subject: [PATCH 1/8] document structure, minor fixes --- docs/Deploy/_category_.json | 4 ---- docs/configuration/_category_.json | 4 ++++ .../pipeline-configuration-files.mdx | 6 ++++++ docs/connectors/_category_.json | 4 ++++ docs/connectors/behavior.mdx | 7 +++++++ docs/connectors/installing.mdx | 6 ++++++ docs/connectors/opencdc.mdx | 6 ++++++ docs/deploy/_category_.json | 4 ++++ docs/{Deploy => deploy}/aws_ec2.mdx | 0 docs/{Deploy => deploy}/overview.mdx | 0 .../{architecture.md => architecture.mdx} | 9 ++++----- .../introduction/{glossary.md => glossary.mdx} | 0 .../{getting-started.mdx => index.mdx} | 7 ++----- docs/introduction/known-limitations.mdx | 14 ++++---------- docs/introduction/plugins.mdx | 8 +++++--- docs/introduction/what-is-conduit.mdx | 18 +++++++++++++----- docs/processors/_category_.json | 4 ++++ docs/processors/getting-started.md | 7 +++++++ docusaurus.config.js | 7 +++---- src/sidebars/sidebars.js | 2 +- src/theme/HomeLayout/NavBar.tsx | 2 +- 21 files changed, 81 insertions(+), 38 deletions(-) delete mode 100644 docs/Deploy/_category_.json create mode 100644 docs/configuration/_category_.json create mode 100644 docs/configuration/pipeline-configuration-files.mdx create mode 100644 docs/connectors/_category_.json create mode 100644 docs/connectors/behavior.mdx create mode 100644 docs/connectors/installing.mdx create mode 100644 docs/connectors/opencdc.mdx create mode 100644 docs/deploy/_category_.json rename docs/{Deploy => deploy}/aws_ec2.mdx (100%) rename docs/{Deploy => deploy}/overview.mdx (100%) rename docs/introduction/{architecture.md => architecture.mdx} (90%) rename docs/introduction/{glossary.md => glossary.mdx} (100%) rename docs/introduction/{getting-started.mdx => index.mdx} (87%) create mode 100644 docs/processors/_category_.json create mode 100644 docs/processors/getting-started.md 
diff --git a/docs/Deploy/_category_.json b/docs/Deploy/_category_.json deleted file mode 100644 index b4f78148..00000000 --- a/docs/Deploy/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "Deploy" - } - \ No newline at end of file diff --git a/docs/configuration/_category_.json b/docs/configuration/_category_.json new file mode 100644 index 00000000..da011bfa --- /dev/null +++ b/docs/configuration/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Configuration", + "position": 1 +} diff --git a/docs/configuration/pipeline-configuration-files.mdx b/docs/configuration/pipeline-configuration-files.mdx new file mode 100644 index 00000000..63d46597 --- /dev/null +++ b/docs/configuration/pipeline-configuration-files.mdx @@ -0,0 +1,6 @@ +--- +title: 'Pipeline Configuration Files' +slug: 'pipeline-configuration-files' +--- + +TBD \ No newline at end of file diff --git a/docs/connectors/_category_.json b/docs/connectors/_category_.json new file mode 100644 index 00000000..e70a4af1 --- /dev/null +++ b/docs/connectors/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Connectors", + "position": 4 +} diff --git a/docs/connectors/behavior.mdx b/docs/connectors/behavior.mdx new file mode 100644 index 00000000..d8d7831e --- /dev/null +++ b/docs/connectors/behavior.mdx @@ -0,0 +1,7 @@ +--- +title: "Connector Behavior" +sidebar_label: "Behavior" +slug: "behavior" +--- + +TBD \ No newline at end of file diff --git a/docs/connectors/installing.mdx b/docs/connectors/installing.mdx new file mode 100644 index 00000000..2589c0ce --- /dev/null +++ b/docs/connectors/installing.mdx @@ -0,0 +1,6 @@ +--- +title: "Installing Connectors" +slug: "installing-connectors" +--- + +TBD \ No newline at end of file diff --git a/docs/connectors/opencdc.mdx b/docs/connectors/opencdc.mdx new file mode 100644 index 00000000..291e730e --- /dev/null +++ b/docs/connectors/opencdc.mdx @@ -0,0 +1,6 @@ +--- +title: "OpenCDC" +slug: "opencdc" +--- + +TBD \ No newline at end of file diff --git 
a/docs/deploy/_category_.json b/docs/deploy/_category_.json new file mode 100644 index 00000000..b1344a6d --- /dev/null +++ b/docs/deploy/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Deploy", + "position": 3 +} diff --git a/docs/Deploy/aws_ec2.mdx b/docs/deploy/aws_ec2.mdx similarity index 100% rename from docs/Deploy/aws_ec2.mdx rename to docs/deploy/aws_ec2.mdx diff --git a/docs/Deploy/overview.mdx b/docs/deploy/overview.mdx similarity index 100% rename from docs/Deploy/overview.mdx rename to docs/deploy/overview.mdx diff --git a/docs/introduction/architecture.md b/docs/introduction/architecture.mdx similarity index 90% rename from docs/introduction/architecture.md rename to docs/introduction/architecture.mdx index b2e6cd75..69340fea 100644 --- a/docs/introduction/architecture.md +++ b/docs/introduction/architecture.mdx @@ -10,10 +10,9 @@ Here is an overview of the Conduit Architecture. Conduit is split in the following layers: * **API layer** - exposes the public APIs used to communicate with Conduit. It exposes 2 types of APIs: * **gRPC** - this is the main API provided by Conduit. The gRPC API definition can be found in - [api.proto](../proto/api/v1/api.proto), it can be used to generate code for the client. + [api.proto](https://github.com/ConduitIO/conduit/blob/main/proto/api/v1/api.proto), it can be used to generate code for the client. * **HTTP** - the HTTP API is generated using [grpc-gateway](https://github.com/grpc-ecosystem/grpc-gateway) and - forwards the requests to the gRPC API. Conduit exposes an - [openapi](../pkg/web/openapi/swagger-ui/api/v1/api.swagger.json) definition that describes the HTTP API, which is + forwards the requests to the gRPC API. Conduit exposes an openapi definition that describes the HTTP API, which is also exposed through Swagger UI on `http://localhost:8080/openapi/`. * **Orchestration layer** - the orchestration layer is responsible for coordinating the flow of operations between the core services. 
It also takes care of transactions, making sure that changes made to specific entities are not visible @@ -52,7 +51,7 @@ Conduit is split in the following layers: * **Plugins** - while this is not a layer in the same sense as the other layers, it is a component separate from everything else. It interfaces with the connector on one side and with Conduit plugins on the other and facilitates the communication between them. A Conduit plugin is a separate process that implements the interface defined in - [plugins.proto](https://github.com/ConduitIO/conduit/blob/main/pkg/plugins/proto/plugins.proto) and provides the + [conduit-connector-protocol](https://github.com/ConduitIO/conduit-connector-protocol) and provides the read/write functionality for a specific resource (e.g. a database). - For more see [GitHub](https://github.com/ConduitIO/conduit/blob/main/docs/architecture.md). \ No newline at end of file +For more see [GitHub](https://github.com/ConduitIO/conduit/blob/main/docs/architecture.md). \ No newline at end of file diff --git a/docs/introduction/glossary.md b/docs/introduction/glossary.mdx similarity index 100% rename from docs/introduction/glossary.md rename to docs/introduction/glossary.mdx diff --git a/docs/introduction/getting-started.mdx b/docs/introduction/index.mdx similarity index 87% rename from docs/introduction/getting-started.mdx rename to docs/introduction/index.mdx index 8fb9b100..8ed786f3 100644 --- a/docs/introduction/getting-started.mdx +++ b/docs/introduction/index.mdx @@ -1,9 +1,6 @@ --- title: 'Getting Started with Conduit' -sidebar_position: 0 -slug: 'getting-started' hide_title: true -sidebar_label: "Getting Started" ---
@@ -23,7 +20,7 @@ To get started: If you’re on Mac, it will look something like this: ```shell -tar zxvf conduit_0.1.0_Darwin_x86_64.tar.gz +tar zxvf conduit_0.3.0_Darwin_x86_64.tar.gz ``` 3. Start Conduit: @@ -34,7 +31,7 @@ tar zxvf conduit_0.1.0_Darwin_x86_64.tar.gz **Tip**: Depending on your operating system, you may need to run `chmod +x conduit` before running the binary. -4. Navigate to `http://localhost:8080/ui/`: +4. Navigate to `http://localhost:8080`: ![Conduit Pipeline](/images/conduit/pipeline.png) diff --git a/docs/introduction/known-limitations.mdx b/docs/introduction/known-limitations.mdx index 06260995..f88f261f 100644 --- a/docs/introduction/known-limitations.mdx +++ b/docs/introduction/known-limitations.mdx @@ -3,13 +3,7 @@ title: 'Known limitations' slug: 'limitations' --- -While Conduit is built on strong foundations and experiences from running similar systems, it's not production ready -at the moment. Following features are on the roadmap and yet to be implemented: - -1. Standard record format - we plan to have the records implement a single standard for CDC events. -2. Delivery and ordering guarantees - from the experience we have so far, messages created internally are reliably delivered through - Conduit (from source nodes, over processing nodes to destination nodes). However, we still need good end-to-end, full-scale - tests to actually prove that. -3. Performance guarantees (for the core) - reasons are identical to reasons for delivery guarantees. -4. Dynamic loading of list of plugins - currently, the API cannot return the list of all available plugins and the available - configuration parameters. Consequently, the UI has the plugin paths and configuration parameters hard-coded. +Conduit is currently in a pre-1.0 state. While Conduit is built on strong +foundations and experiences from running similar systems, we don't recommend +relying on it in a production environment. 
It may break and end up in an +unexpected state, or in the worst case, result in lost data. \ No newline at end of file diff --git a/docs/introduction/plugins.mdx b/docs/introduction/plugins.mdx index 66c6203b..6f38bce0 100644 --- a/docs/introduction/plugins.mdx +++ b/docs/introduction/plugins.mdx @@ -7,9 +7,11 @@ sidebar_position: 4 A plugin is an external process which communicates with Conduit and knows how to read/write records from/to a data source/destination (e.g. a database). When thinking about plugins for Conduit, our goals were to: -- provide a good development experience to plugin developers. +- provide a good development experience to plugin developers. - ship Conduit with real built-in plugins (compiled into the Conduit binary), -- to make it as easy as possible to write plugins in _any_ programming language +- to make it as easy as possible to write plugins in _any_ programming language - the Plugin SDK to be decoupled from Conduit and be able to change without changing Conduit itself. -As we work on "How to build a plugin" documentation, you may view the [Conduit Plugin Architecture](https://github.com/ConduitIO/conduit/blob/main/docs/architecture-decision-records/20220121-conduit-plugin-architecture.md). \ No newline at end of file +Have a look at +the [Connector SDK](https://github.com/conduitio/conduit-connector-sdk) to find +out how to build a Conduit connector written in Go. \ No newline at end of file diff --git a/docs/introduction/what-is-conduit.mdx b/docs/introduction/what-is-conduit.mdx index c64a5547..3298864d 100644 --- a/docs/introduction/what-is-conduit.mdx +++ b/docs/introduction/what-is-conduit.mdx @@ -12,26 +12,34 @@ Out of the box, Conduit comes with: - a UI - common connectors -- transforms +- processors - observability The 🔋 are included. ### It’s simple, yet powerful -Eliminate the multi-step process you go through today. Just download the binary and start building. 
Conduit pipelines run in their own goroutines and are connected using Go channels. This makes Conduit pipelines incredibly performant on multi-core machines. +Eliminate the multi-step process you go through today. Just download the binary +and start building. Conduit pipelines run in their own goroutines and are +connected using Go channels. This makes Conduit pipelines incredibly performant +on multi-core machines. ### It’s real-time -Conduit pipelines listen for changes to a database, data warehouse, etc., and allows your data applications to act upon those changes in real-time. +Conduit pipelines listen for changes to a database, data warehouse, etc., and +allows your data applications to act upon those changes in real-time. ### It’s flexible -Conduit pipelines listen for changes to a database, data warehouse, etc., and allows your data applications to act upon those changes in real-time. +Conduit pipelines listen for changes to a database, data warehouse, etc., and +allows your data applications to act upon those changes in real-time. ### It’s Extensible -Conduit connectors are plugins that communicate with Conduit via a gRPC interface. This means that plugins can be written in any language as long as they conform to the required interface. More in-depth documentation regarding plugins is in progress! +Conduit connectors are plugins that communicate with Conduit via a gRPC +interface. This means that plugins can be written in any language as long as +they conform to the required interface. More in-depth documentation regarding +plugins is in progress! 
## Get Involved diff --git a/docs/processors/_category_.json b/docs/processors/_category_.json new file mode 100644 index 00000000..adb77b1f --- /dev/null +++ b/docs/processors/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Processors", + "position": 2 +} diff --git a/docs/processors/getting-started.md b/docs/processors/getting-started.md new file mode 100644 index 00000000..e4f7e226 --- /dev/null +++ b/docs/processors/getting-started.md @@ -0,0 +1,7 @@ +--- +title: 'Getting Started with Processors' +slug: 'getting-started' +sidebar_label: "Getting Started" +--- + +TBD \ No newline at end of file diff --git a/docusaurus.config.js b/docusaurus.config.js index 998de7a5..02c68ced 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -8,8 +8,8 @@ module.exports = { url: 'https://conduit.io', // Url to your site with no trailing slash baseUrl: '/', onBrokenLinks: 'ignore', - onBrokenMarkdownLinks: 'error', - onDuplicateRoutes: 'error', + onBrokenMarkdownLinks: 'warn', + onDuplicateRoutes: 'warn', favicon: 'images/favicon.ico', scripts: [ { @@ -45,14 +45,13 @@ module.exports = { logo: { alt: 'Conduit Logo', src: 'images/conduit/on-white-conduit-logo.png', - href: 'https://conduit.io', srcDark: 'images/conduit-logo-dark.svg', }, items: [ { to: '/', label: 'Home', position: 'left', activeBaseRegex: `///` }, { type: 'doc', - docId: 'introduction/getting-started', + docId: 'introduction/index', position: 'left', label: 'Documentation', }, diff --git a/src/sidebars/sidebars.js b/src/sidebars/sidebars.js index 4a192617..8a547344 100644 --- a/src/sidebars/sidebars.js +++ b/src/sidebars/sidebars.js @@ -14,7 +14,7 @@ module.exports = { tutorialSidebar: [ {type: 'autogenerated', dirName: '.'},{ type: 'link', - label: 'Connectors', // The link label + label: 'Connector List', // The link label href: 'https://github.com/ConduitIO/conduit/blob/main/docs/connectors.md', // The external URL }, ], diff --git a/src/theme/HomeLayout/NavBar.tsx 
b/src/theme/HomeLayout/NavBar.tsx index 0f5b443d..b88c3eab 100644 --- a/src/theme/HomeLayout/NavBar.tsx +++ b/src/theme/HomeLayout/NavBar.tsx @@ -31,7 +31,7 @@ const navLinks = [ }, { label: 'Documentation', - href: '/docs/introduction/getting-started', + href: '/docs/introduction', }, { label: 'GitHub', From 661990ea6384a6867637848be520b1fc9e773a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Thu, 20 Oct 2022 16:24:36 +0200 Subject: [PATCH 2/8] add more docs --- .../pipeline-configuration-files.mdx | 92 ++++++++- docs/connectors/behavior.mdx | 16 +- docs/connectors/installing.mdx | 38 +++- docs/connectors/opencdc.mdx | 6 - .../{index.mdx => getting-started.mdx} | 3 + docs/introduction/known-limitations.mdx | 9 - docs/introduction/pipeline-semantics.mdx | 186 ++++++++++++++++++ docs/processors/getting-started.md | 164 ++++++++++++++- docusaurus.config.js | 2 +- src/theme/HomeLayout/NavBar.tsx | 2 +- static/images/pipeline_example.svg | 12 ++ 11 files changed, 509 insertions(+), 21 deletions(-) delete mode 100644 docs/connectors/opencdc.mdx rename docs/introduction/{index.mdx => getting-started.mdx} (94%) delete mode 100644 docs/introduction/known-limitations.mdx create mode 100644 docs/introduction/pipeline-semantics.mdx create mode 100644 static/images/pipeline_example.svg diff --git a/docs/configuration/pipeline-configuration-files.mdx b/docs/configuration/pipeline-configuration-files.mdx index 63d46597..6e44f52c 100644 --- a/docs/configuration/pipeline-configuration-files.mdx +++ b/docs/configuration/pipeline-configuration-files.mdx @@ -3,4 +3,94 @@ title: 'Pipeline Configuration Files' slug: 'pipeline-configuration-files' --- -TBD \ No newline at end of file +Pipeline configuration files give you the ability to define pipelines that are provisioned by Conduit at startup. +It's as simple as creating a YAML file that defines pipelines, connectors, processors, and their corresponding configurations. 
+ +## Getting started +Create a folder called `pipelines` at the same level as your Conduit binary file, add all your YAML files +there, then run Conduit using the command: +``` +./conduit +``` +Conduit will only search for files with `.yml` or `.yaml` extensions, recursively in all sub-folders. + +If you have your YAML files in a different directory, or want to provision only one file, then simply run Conduit with +the CLI flag `pipelines.path` and point to your file or directory: +``` +./conduit -pipeline.path ../my-directory +``` +If your directory does not exist, Conduit will fail with an error: `"pipelines.path" config value is invalid` + +### YAML Schema +The file in general has two root keys, the `version`, and the `pipelines` map. The map consists of other elements like +`status` and `name`, which are configurations for the pipeline itself. + +To create connectors in that pipeline, simply add another map under the pipeline map, and call it `connectors`. + +To create processors, either add a `processors` map under a pipeline ID, or under a connector ID, depending on its parent. +Check this YAML file example with explanation for each field: + +``` yaml +version: 1.0 # parser version, the only supported version for now is 1.0 [mandatory] + +pipelines: # a map of pipelines IDs and their configurations. + pipeline1: # pipeline ID, has to be unique. + status: running # pipelines status at startup, either running or stopped. [mandatory] + name: pipeline1 # pipeline name, if not specified, pipeline ID will be used as name. [optional] + description: desc # pipeline description. [optional] + connectors: # a map of connectors IDs and their configurations. + con1: # connector ID, has to be unique per pipeline. + type: source # connector type, either "source" or "destination". [mandatory] + plugin: builtin:file # connector plugin. [mandatory] + name: con3 # connector name, if not specified, connector ID will be used as name. 
[optional] + settings: # map of configurations keys and their values. + path: ./file1.txt # for this example, the plugin "bultin:file" has only one configuration, which is path. + con2: + type: destination + plugin: builtin:file + name: file-dest + settings: + path: ./file2.txt + processors: # a map of processor IDs and their configurations, "con2" is the processor parent. + proc1: # processor ID, has to be unique for each parent + type: js # processor type. [mandatory] + settings: # map of processor configurations and values + Prop1: string + processors: # processor IDs, that have the pipeline "pipeline1" as a parent. + proc2: + type: js + settings: + prop1: ${ENV_VAR} # yon can use environmental variables by wrapping them in a dollar sign and curly braces ${}. +``` + +If the file is invalid (missed a mandatory field, or has an invalid configuration value), then the pipeline that has the +invalid value will be skipped, with an error message logged. + +If two pipelines in one file have the same ID, or the `version` field was not specified, then the file would be +non-parsable and will be skipped with an error message logged. + +If two pipelines from different files have the same ID, the second pipeline will be skipped, with an error message +specifying which pipeline was not provisioned. + +**_Note_**: Connector IDs and processor IDs will get their parent ID prefixed, so if you specify a connector ID as `con1` +and its parent is `pipeline1`, then the provisioned connector will have the ID `pipeline1:con1`. Same goes for processors, +if the processor has a pipeline parent, then the processor ID will be `connectorID:processorID`, and if a processor +has a connector parent, then the processor ID will be `pipelineID:connectorID:processorID`. + +## Pipelines Immutability +Pipelines provisioned by configuration files are **immutable**, any updates needed on a provisioned pipeline have to be +done through the configuration file it was provisioned from. 
You can only control stopping and starting a pipeline +through the UI or API. + +### Updates and Deletes +Updates and deletes for a pipeline provisioned by configuration files can only be done through the configuration files. +Changes should be made to the files, then Conduit has to be restarted to reload the changes. Any updates or deletes done +through the API or UI will be prohibited. + +* To delete a pipeline: simply delete it from the `pipelines` map from the configuration file, then run conduit again. +* To update a pipeline: change any field value from the configuration file, and run conduit again to address these updates. + +Updates will preserve the status of the pipeline, and will continue working from where it stopped. However, the pipeline +will start from the beginning of the source and will not continue from where it stopped, if one of these values were updated: +{`pipeline ID`, `connector ID`, `connector plugin`, `connector type`}. + diff --git a/docs/connectors/behavior.mdx b/docs/connectors/behavior.mdx index d8d7831e..ee1ae9fc 100644 --- a/docs/connectors/behavior.mdx +++ b/docs/connectors/behavior.mdx @@ -4,4 +4,18 @@ sidebar_label: "Behavior" slug: "behavior" --- -TBD \ No newline at end of file +This document provides insights on how Conduit communicates with a connector, +what is expected from the connector and the guarantees that Conduit provides. + +## Conduit Connector Protocol + +Conduit expects all connectors to follow the +[Conduit Connector Protocol](https://github.com/ConduitIO/conduit-connector-protocol) +. The connector protocol is a protobuf file describing the interface between +Conduit and the connector in the form of gRPC services. This approach allows +connectors to be written in any language with support for gRPC. + +The connector protocol splits the connector interface in 3 gRPC services - one +for the source, another for the destination, and a third one for the connector +specifications. 
A connector needs to implement the specifications and at least +the source or destination. diff --git a/docs/connectors/installing.mdx b/docs/connectors/installing.mdx index 2589c0ce..bd79ae9f 100644 --- a/docs/connectors/installing.mdx +++ b/docs/connectors/installing.mdx @@ -3,4 +3,40 @@ title: "Installing Connectors" slug: "installing-connectors" --- -TBD \ No newline at end of file +Conduit ships with a number of built-in connectors: + +- [File connector](https://github.com/ConduitIO/conduit-connector-file) provides + a source/destination to read/write a local file (useful for quickly trying out + Conduit without additional setup). +- [Kafka connector](https://github.com/ConduitIO/conduit-connector-kafka) + provides a source/destination for Apache Kafka. +- [Postgres connector](https://github.com/ConduitIO/conduit-connector-postgres) + provides a source/destination for PostgreSQL. +- [S3 connector](https://github.com/ConduitIO/conduit-connector-s3) provides a + source/destination for AWS S3. +- [Generator connector](https://github.com/ConduitIO/conduit-connector-generator) + provides a source which generates random data (useful for testing). + +Besides these connectors there is a number of standalone connectors that can be +added to Conduit as plugins (find the complete +list [here](https://github.com/ConduitIO/conduit/blob/main/docs/connectors.md)). + +### Standalone Connector Binary + +To install a standalone connector you first need the compiled connector binary. +A binary can normally be downloaded from the latest release in the connector's +GitHub repository (this may vary in 3rd party connectors not developed by the +Conduit team). Make sure to download the binary that matches your operating +system and architecture. + +Alternatively you can build the binary yourself (for instructions on building a +connector please refer to the readme of that specific connector). 
+ +### Installing the Connector in Conduit + +Conduit loads standalone connectors at startup. By default, it looks for +binaries in the folder `connectors` relative to the Conduit binary (you can +adjust this with the CLI flag `-connectors.path`). + +To install a connector you need to place the connector binary in the connectors +folder and restart Conduit. diff --git a/docs/connectors/opencdc.mdx b/docs/connectors/opencdc.mdx deleted file mode 100644 index 291e730e..00000000 --- a/docs/connectors/opencdc.mdx +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: "OpenCDC" -slug: "opencdc" ---- - -TBD \ No newline at end of file diff --git a/docs/introduction/index.mdx b/docs/introduction/getting-started.mdx similarity index 94% rename from docs/introduction/index.mdx rename to docs/introduction/getting-started.mdx index 8ed786f3..0952e039 100644 --- a/docs/introduction/index.mdx +++ b/docs/introduction/getting-started.mdx @@ -1,6 +1,9 @@ --- title: 'Getting Started with Conduit' +sidebar_position: 0 +slug: 'getting-started' hide_title: true +sidebar_label: "Getting Started" ---
diff --git a/docs/introduction/known-limitations.mdx b/docs/introduction/known-limitations.mdx deleted file mode 100644 index f88f261f..00000000 --- a/docs/introduction/known-limitations.mdx +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: 'Known limitations' -slug: 'limitations' ---- - -Conduit is currently in a pre-1.0 state. While Conduit is built on strong -foundations and experiences from running similar systems, we don't recommend -relying on it in a production environment. It may break and end up in an -unexpected state, or in the worst case, result in lost data. \ No newline at end of file diff --git a/docs/introduction/pipeline-semantics.mdx b/docs/introduction/pipeline-semantics.mdx new file mode 100644 index 00000000..3cbb5670 --- /dev/null +++ b/docs/introduction/pipeline-semantics.mdx @@ -0,0 +1,186 @@ +--- +title: "Pipeline Semantics" +sidebar_position: 6 +slug: "pipeline-semantics" +--- + +This document describes the inner workings of a Conduit pipeline, its structure, and behavior. It also describes a +Conduit message and its lifecycle as it flows through the pipeline. + +**NOTE**: Some parts of this document describe behavior that is not yet fully implemented (e.g. DLQs). +For more information see [#383](https://github.com/ConduitIO/conduit/pull/383). This note +should be removed once the new behavior is implemented. + +## Pipeline structure + +A Conduit pipeline is a directed acyclic graph of nodes. Each node runs in its own goroutine and is connected to other +nodes via unbuffered Go channels that can transmit messages. In theory, we could create arbitrarily complex graphs of +nodes, but for the sake of a simpler API we expose the ability to create graphs with the following structure: + +![Pipeline](/images/pipeline_example.svg) + +In the diagram above we see 7 sections: + +- **Source connectors** - represents the code that communicates with a 3rd party system and continuously fetches records +and sends them to Conduit (e.g. 
[kafka connector](https://github.com/conduitio/conduit-connector-kafka)). Every source +connector is managed by a source node that receives records, wraps them in a message, and sends them downstream to the +next node. A pipeline requires at least one source connector to be runnable. +- **Source processors** - these processors only receive messages originating at a specific source connector. Source +processors are created by specifying the corresponding source connector as the parent entity. Source processors are not +required for starting a pipeline. +- **Fan-in node** - this node is essentially a multiplexer that receives messages produced by all source connectors and +sends them into one output channel. The order of messages coming from different connectors is nondeterministic. A +fan-in node is automatically created for all pipelines. +- **Pipeline processors** - these processors receive all messages that flow through the pipeline, regardless of the +source or destination. Pipeline processors are created by specifying the pipeline as the parent entity. Pipeline processors +are not required for starting a pipeline. +- **Fan-out node** - this node is the counterpart to the fan-in node and acts as a demultiplexer that sends messages +coming from a single input to multiple downstream nodes (one for each destination). The fan-out node does not buffer +messages, instead, it waits for a message to be sent to all downstream nodes before processing the next message (see +[backpressure](#backpressure)). A fan-out node is automatically created for all pipelines. +- **Destination processors** - these processors receive only messages that are meant to be sent to a specific +destination connector. Destination processors are created by specifying the corresponding destination connector as the +parent entity. Destination processors are not required for starting a pipeline. 
+- **Destination connectors** - represents the code that communicates with a 3rd party system and continuously receives +records from Conduit and writes them to the destination (e.g. +[kafka connector](https://github.com/conduitio/conduit-connector-kafka)). Every destination connector is managed by a +destination node that receives records and sends them to the connector. A pipeline requires at least one destination +connector to be runnable. + +There are additional internal nodes that Conduit adds to each pipeline not shown in the diagram, as they are +inconsequential for the purpose of this document (e.g. nodes for gathering metrics, nodes for managing acknowledgments, +etc.). + +## Message + +A message is a wrapper around a record that manages the record's lifecycle as it flows through the pipeline. Messages +are created in source nodes when they receive records from the source connector, and they are passed down the pipeline +between nodes until they are acked or nacked. Nodes are only allowed to hold a reference to a single message at a time, +meaning that they need to pass the message to the next node before taking another message¹. This also means there is no +explicit buffer in Conduit, a pipeline can only hold only as many messages as there are nodes in the pipeline (see +[backpressure](#backpressure) for more information). + +¹This might change in the future if we decide to add support for multi-message transforms. + +### Message states + +A message can be in one of 3 states: + +- **Open** - all messages start in the open state. This means the message is currently in processing, either by a node +or a destination connector. A pipeline won't stop until all messages transition from the open state into one of the +other two states. +- **Acked** - a message was successfully processed and acknowledged. This can be done either by a processor (e.g. it +filtered the message out) or by a destination. 
If a pipeline contains multiple destinations, the message needs to be +acknowledged by all destinations before it is marked as acked. Acks are propagated back to the source connector and +can be used to advance the position in the source system if applicable. +- **Nacked** - the processing of the message failed and resulted in an error, so the message was negatively +acknowledged. This can be done either by a processor (e.g. a transform failed) or by a destination. If a pipeline +contains multiple destinations, the message needs to be negatively acknowledged by at least one destination before it +is marked as nacked. When a message is nacked, the message is passed to the [DLQ](#dead-letter-queue) handler, which +essentially controls what happens after a message is nacked (stop pipeline, drop message and continue running or store +message in DLQ and continue running). + +**Important**: if a message gets nacked and the DLQ handler successfully processes the nack (e.g. stores the message in +the dead letter queue), the source connector will receive an ack as if the message was successfully processed, even +though Conduit marks it internally as nacked. In other words, the source connector will receive an ack every time +Conduit handled a message end-to-end and it can be safely discarded from the source. + +Pipeline nodes will either leave the message open and send it to the next node for processing or ack/nack it and not +send it further down the pipeline. If the ack/nack fails, the node will stop running and return an error that will +consequently stop the whole pipeline. The returned error is stored in the pipeline for further inspection by the user. + +### Message state change handlers + +A pipeline node can register state change handlers on a message that will be called when the message state changes. 
This
+is used for example to register handlers that reroute nacked messages to a dead letter queue or to update metrics when a
+message reaches the end of the pipeline. If a message state change handler returns an error, the node that triggered the
+ack/nack will stop running, essentially causing the whole pipeline to stop.
+
+## Semantics
+
+### Messages are delivered in order
+
+Since messages are passed between nodes in channels and a node only processes one message at a time, it is guaranteed
+that messages from a single source connector will flow through the Conduit pipeline in the same order that they were produced
+by that source.
+
+There are two caveats:
+
+- If a pipeline contains multiple source connectors, the order of two messages coming from different connectors is
+nondeterministic. Messages coming from the same source connector are still guaranteed to retain their order.
+- If a dead letter queue is configured, negatively acknowledged messages will be removed from the stream while the
+pipeline will keep running, thus impacting the order of messages.
+
+The order guarantee only holds inside Conduit. Once a message reaches a destination connector, the connector is allowed to buffer
+messages and batch write them to 3rd party systems. Normally the connector would retain the order, although we can't
+vouch for badly written connectors that don't follow this behavior.
+
+### Messages are delivered at least once
+
+Between pipeline restarts, it is guaranteed that any message that is processed successfully by all nodes and not
+filtered out will be delivered to a destination connector at least once. Multiple deliveries can occur in pipelines with
+multiple destinations that stopped because of a negatively acknowledged record, or pipelines where a destination
+negatively acknowledged a record and processed more messages after that. For this reason, we strongly recommend
+implementing the write operation of a destination connector in an idempotent way (if possible). 
+ +The delivery guarantee can be changed to "at most once" by adding a [dead letter queue](#dead-letter-queue) handler that +drops unsuccessfully processed messages. + +### Acks are delivered in order + +Conduit ensures that acknowledgments are sent to the source connector in the exact same order as records produced by the +connector. This guarantee still holds, even if a badly implemented destination connector acknowledges records in a +different order, or if a processor filters out a record (i.e. acks the message) while a message that came before it is +still being processed. + +### Acks are delivered at most once + +Acknowledgments are sent back to the source connector at most once. This means that if a message gets negatively +acknowledged and is not successfully processed by a DLQ handler, the acknowledgment for that message won't be delivered +to the source connector. Acknowledgments of all messages produced after this message also won't be delivered to the +source connector, otherwise the order delivery guarantee would be violated. The absence of an acknowledgment after the +source connector teardown is initiated can be interpreted as a negative acknowledgment. + +### Backpressure + +The usage of unbuffered channels between nodes and the absence of explicit buffers results in backpressure. This means +that the speed of the destination connector dictates the speed of the whole pipeline. + +There is an implicit buffer that needs to be filled up before backpressure takes effect. The buffer is equal to the +number of nodes in a pipeline - similar to how a longer garden hose holds more water, a longer Conduit pipeline can hold +more messages. There are two exceptions to this rule: + +- Conduit is using gRPC streams to communicate with standalone connectors, which internally buffers requests before +sending them over the wire, thus creating another implicit buffer (we are aware of this +issue: [#211](https://github.com/ConduitIO/conduit/issues/211)). 
+- Destination connectors are allowed to collect multiple records and write them in batches to the destination, which
+creates a buffer that depends on the connector implementation.
+
+If there are multiple destinations, the fanout node won't fetch a new message from the upstream node until the current
+message has been successfully sent to all downstream nodes (this doesn't mean the message was necessarily processed, just
+that all downstream nodes received the message). As a consequence, the speed of the pipeline will be throttled to
+accommodate the abilities of the slowest destination connector.
+
+### Dead letter queue
+
+Messages that get negatively acknowledged can be rerouted to another destination called a dead letter queue (DLQ) where
+they are stored and can be reprocessed at a later point in time after manual intervention. If rerouting is set up and
+the message successfully reaches the DLQ, the message will be internally nacked, but an acknowledgment will be sent to
+the source connector since Conduit handled the message and it can be discarded from the source. The user has the option
+to configure a DLQ that simply logs a warning and drops messages to achieve "at most once" delivery guarantees.
+
+### Pipeline stop
+
+A pipeline can be stopped in two ways - either it's stopped gracefully or forcefully.
+
+- A graceful stop is initiated either by Conduit shutting down or by the user requesting the pipeline to stop. Only the
+source connector nodes will receive the signal to stop running. The source nodes will stop running and close their
+outgoing channels, notifying the downstream nodes that there will be no more messages. This behavior propagates down
+the pipeline until the last node stops running. Any messages that were being processed while the pipeline received a
+stop signal will be processed normally and written to all destinations.
+- A forceful stop is initiated when a node stops running because it experienced an unrecoverable error (e.g. 
it nacked a +message and received an error because no DLQ is configured, or the connector plugin returned an unexpected error). In +that case, the context that is shared by all nodes will get canceled, signaling to all nodes simultaneously that they +should stop running as soon as possible. Messages that are in the pipeline won't be drained, instead, they are dropped +and will be requested from the source again once the pipeline is restarted. The error returned from the first node +that failed will be stored in the pipeline and can be retrieved through the API. \ No newline at end of file diff --git a/docs/processors/getting-started.md b/docs/processors/getting-started.md index e4f7e226..56f2abf9 100644 --- a/docs/processors/getting-started.md +++ b/docs/processors/getting-started.md @@ -4,4 +4,166 @@ slug: 'getting-started' sidebar_label: "Getting Started" --- -TBD \ No newline at end of file +A processor is a component that operates on a single record that flows through a pipeline. It can either change the record +(i.e. **transform** it) or **filter** it out based on some criteria. Since they are part of pipelines, making yourself +familiar with [pipeline semantics](/docs/introduction/pipeline-semantics) is highly recommended. + +![Pipeline](/images/pipeline_example.svg) + +Processors are **optional** components in a pipeline, i.e. a pipeline can be started without them. They are always attached +to a single parent, which can be either a connector or a pipeline. With that, we can say that we have the following types +of processors: +1. **Source processors**: these processors only receive messages originating at a specific source connector. Source + processors are created by specifying the corresponding source connector as the parent entity. +2. **Pipeline processors**: these processors receive all messages that flow through the pipeline, regardless of the + source or destination. Pipeline processors are created by specifying the pipeline as the parent entity. +3. 
**Destination processors**: these processors receive only messages that are meant to be sent to a specific + destination connector. Destination processors are created by specifying the corresponding destination connector as the + parent entity. + +Given that every processor can have one (and only one) parent, processors cannot be shared. In case the same processing +needs to happen for different sources or destinations, you have two options: +1. If records from all sources (or all destinations) need to be processed in the same way, then you can create + a pipeline processor +2. If records from some, but not all, sources (or destinations) need to be processed in the same way, then you need to + create multiple processors (one for each source or destination) and configure them in the same way. + +## Adding and configuring a processor + +Processors are created through the `/processors` endpoint. Here's an example: + +```json lines +POST /v1/processors +{ + // name of the processor in Conduit + // note that this is NOT a user-defined name for this processor + "name": "extractfieldpayload", + "parent": + { + // type of parent: TYPE_CONNECTOR or TYPE_PIPELINE + "type": "TYPE_CONNECTOR", + // parent ID (connector ID in this case) + "id": "aed07589-44d8-4c68-968c-1f6c5197f13b" + }, + "config": + { + "settings": + { + // configuration map for this processor + "field": "name" + } + } +} +``` +The request to create a processor is described in [api.swagger.json](https://github.com/ConduitIO/conduit/blob/main/pkg/web/openapi/swagger-ui/api/v1/api.swagger.json). + +## Supported processors + +Conduit provides a number of built-in processors, such as filtering fields, replacing them, posting payloads to HTTP endpoints etc. +Conduit also provides the ability to write custom processors in JavaScript. 
+ +### Built-in processors + +An up-to-date list of all built-in processors and detailed descriptions can be found [here](https://pkg.go.dev/github.com/conduitio/conduit/pkg/processor/procbuiltin). +Note that all built-in processors that operate on the payload actually operate on `Record.Payload.After`. If you need to manipulate +the field `Record.Payload.Before` you can use a [JavaScript processor](#javascript-processors). + +An example is available in [`extract-field-transform.sh`](https://github.com/ConduitIO/conduit/blob/main/examples/processors/extract-field-transform.sh). The script will +set up a pipeline with the built-in extract-field processors. + +### JavaScript processors + +JavaScript processors make it possible to write custom processors in JavaScript. The API name for JavaScript processors +(used in the request to create a processor) is `js`. There's only one configuration parameter, `script`, which is the +script itself. To find out what's possible with the JS processors, also refer to the documentation for [goja](https://github.com/dop251/goja), +which is the JavaScript engine we use. + +Here's an example of a request payload to create a JavaScript processor: + +```json +{ + "name": "js", + "parent": { + "type": "TYPE_CONNECTOR", + "id": "d1ae72ea-9d9c-4bb2-b993-fdb7a01825ab" + }, + "config": { + "settings": { + "script": "function process(record) {\n record.Metadata[\"foo-key\"] = \"foo-value\";\n return record;\n}\n" + } + } +} +``` +The above will create a JavaScript processor (`"name": "js"`), attached to a connector (for the parent, we have +`"type": "TYPE_CONNECTOR"`). 
The script used is: +```javascript +function process(record) { + record.Metadata["foo-key"] = "foo-value"; + return record; +} +``` + +The script needs to define a function called `process`, which accepts an `sdk.Record`, and returns: +* an `sdk.Record`, in case you want to transform the record, +* `null`, in case you want to drop the record from the pipeline. + +The above example request transforms a record, by "enriching" its metadata (it adds a metadata key). Following is an +example where we also filter records: +```javascript +function process(r) { + // if the record metadata has a "keepme" key set + // we will keep it. + // otherwise we return null (i.e. we drop the record from the pipeline) + if (r.Metadata["keepme"] != undefined) { + return r + } + return null; +} +``` + +The script is not constrained to having only this function, i.e. you can have something like this: +```javascript +function doSomething(record) { + // do something with the record + return record +} + +function process(record) { + doSomething(record) + return record +} +``` + +Conduit also provides a number of helper objects and methods which can be used in the JS code. Those are, currently: +1. `logger` - a `zerolog.Logger` which writes to the Conduit server logs. You can use it in the same way you would use + it in Go code, i.e. you can write this for example: `logger.Info().Msgf("hello, %v!", "world")` +2. `Record()` - constructs a `record.Record`. +3. `RawData()` - constructs `record.RawData`. +4. `StructuredData()` - constructs `record.StructuredData`. 
+ +Following is an example of a JavaScript processor, where we transform a record and utilize a number of tools mentioned +above: +```javascript +// Parses the record payload as JSON +function parseAsJSON(record) { + // we can use all of the JavaScript built-in functions + // we use the record as if we would use it Go code, + // so record.Payload.Bytes() gives us the payload bytes + return JSON.parse(String.fromCharCode.apply(String, record.Payload.Bytes())) +} + +function process(record) { + logger.Info().Msg("entering process"); + + let json = parseAsJSON(record); + json["greeting"] = "hello!"; + logger.Info().Msgf("json: %v", json); + + // we're creating a new RawData object, using a helper + record.Payload.After = new RawData(); + record.Payload.After.Raw = JSON.stringify(json); + + logger.Info().Msg("exiting process"); + return record; +} +``` \ No newline at end of file diff --git a/docusaurus.config.js b/docusaurus.config.js index 02c68ced..a0db53b2 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -51,7 +51,7 @@ module.exports = { { to: '/', label: 'Home', position: 'left', activeBaseRegex: `///` }, { type: 'doc', - docId: 'introduction/index', + docId: 'introduction/getting-started', position: 'left', label: 'Documentation', }, diff --git a/src/theme/HomeLayout/NavBar.tsx b/src/theme/HomeLayout/NavBar.tsx index b88c3eab..0f5b443d 100644 --- a/src/theme/HomeLayout/NavBar.tsx +++ b/src/theme/HomeLayout/NavBar.tsx @@ -31,7 +31,7 @@ const navLinks = [ }, { label: 'Documentation', - href: '/docs/introduction', + href: '/docs/introduction/getting-started', }, { label: 'GitHub', diff --git a/static/images/pipeline_example.svg b/static/images/pipeline_example.svg new file mode 100644 index 00000000..d27b44f7 --- /dev/null +++ b/static/images/pipeline_example.svg @@ -0,0 +1,12 @@ + + + + + + + SourceConnectors............Source ProcessorsFan-inNode...Pipeline ProcessorsFan-outNodeDestination ProcessorsDestinationConnectors............ 
\ No newline at end of file From 0d795072249709c39ce84000cb6b6d0ad0b0b22f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Mon, 24 Oct 2022 17:56:56 +0200 Subject: [PATCH 3/8] even more docs --- docs/connectors/behavior.mdx | 49 ++++++++++++++++++++++--- docs/connectors/building.mdx | 19 ++++++++++ docs/connectors/installing.mdx | 3 +- docs/introduction/connectors.mdx | 18 +++++++++ docs/introduction/getting-started.mdx | 25 ++++++++----- docs/introduction/glossary.mdx | 24 +++++++----- docs/introduction/plugins.mdx | 17 --------- docs/introduction/what-is-conduit.mdx | 19 ++++------ static/images/standalone-vs-builtin.svg | 1 + 9 files changed, 120 insertions(+), 55 deletions(-) create mode 100644 docs/connectors/building.mdx create mode 100644 docs/introduction/connectors.mdx delete mode 100644 docs/introduction/plugins.mdx create mode 100644 static/images/standalone-vs-builtin.svg diff --git a/docs/connectors/behavior.mdx b/docs/connectors/behavior.mdx index ee1ae9fc..022d6490 100644 --- a/docs/connectors/behavior.mdx +++ b/docs/connectors/behavior.mdx @@ -2,20 +2,57 @@ title: "Connector Behavior" sidebar_label: "Behavior" slug: "behavior" +sidebar_position: 1 --- -This document provides insights on how Conduit communicates with a connector, -what is expected from the connector and the guarantees that Conduit provides. +This document provides insights on how Conduit communicates with a connector. ## Conduit Connector Protocol Conduit expects all connectors to follow the -[Conduit Connector Protocol](https://github.com/ConduitIO/conduit-connector-protocol) -. The connector protocol is a protobuf file describing the interface between -Conduit and the connector in the form of gRPC services. This approach allows -connectors to be written in any language with support for gRPC. +[Conduit Connector Protocol](https://github.com/ConduitIO/conduit-connector-protocol). 
+The connector protocol is a set of protobuf files describing +the [interface](#protocol-grpc-interface) +between Conduit and the connector in the form of gRPC services. This approach +allows connectors to be written in any language with support for gRPC. The connector protocol splits the connector interface in 3 gRPC services - one for the source, another for the destination, and a third one for the connector specifications. A connector needs to implement the specifications and at least the source or destination. + +Note that you don't need to use the connector protocol directly - we provide a +[Go connector SDK](https://github.com/ConduitIO/conduit-connector-sdk) that +hides the complexity of the protocol and simplifies the implementation of a +connector. + +### Standalone vs built-in connectors + +While the Conduit Connector Protocol decouples Conduit from its connectors by +using gRPC, it also provides a thin Go layer that allows any Go connector to be +compiled into the Conduit binary as a built-in connector. The following diagram +shows how Conduit communicates with a standalone connector and a built-in +connector. + +![Standalone vs built-in connectors](/images/standalone-vs-builtin.svg) + +**Standalone connectors** are run as separate processes, separate from the +Conduit process. They need to have an entrypoint (binary or script) which runs +the connector and starts the gRPC server responsible for communicating with +Conduit. A standalone connector process is started and stopped by Conduit on +demand. One connector process will be started for every pipeline connector in +Conduit. + +**Built-in connectors** on the other hand are executed in the same process as +Conduit and communicate with Conduit through Go channels instead of gRPC. Any +connector written in Go can be compiled into the Conduit binary and used as a +built-in connector. 
+ +Find ouy more about the [Conduit connector plugin architecture](https://github.com/ConduitIO/conduit/blob/main/docs/architecture-decision-records/20220121-conduit-plugin-architecture.md). + +## Protocol gRPC Interface + +The protocol interface is hosted on the +[Buf schema registry](https://buf.build/conduitio/conduit-connector-protocol/docs/main:connector.v1). +Use it as a starting point when implementing a connector in a language other +than Go. \ No newline at end of file diff --git a/docs/connectors/building.mdx b/docs/connectors/building.mdx new file mode 100644 index 00000000..c27beeab --- /dev/null +++ b/docs/connectors/building.mdx @@ -0,0 +1,19 @@ +--- +title: "Building Connectors" +slug: "building-connectors" +sidebar_position: 2 +--- + +Conduit connectors can be built in any programming language that supports gRPC. +To make it easier to write connectors we provide +a [Connector SDK](https://github.com/ConduitIO/conduit-connector-sdk) written in +Go. Using the SDK is the recommended way of writing a Conduit connector. + +## Conduit connector template + +The easiest way to start implementing your own Conduit connector is by using the +[Conduit connector template](https://github.com/ConduitIO/conduit-connector-template). +It contains the basic project structure as well as some additional utilities +like GitHub actions and a Makefile. + +Find out more about the template and how to use it in the readme. \ No newline at end of file diff --git a/docs/connectors/installing.mdx b/docs/connectors/installing.mdx index bd79ae9f..d6886586 100644 --- a/docs/connectors/installing.mdx +++ b/docs/connectors/installing.mdx @@ -1,6 +1,7 @@ --- title: "Installing Connectors" slug: "installing-connectors" +sidebar_position: 0 --- Conduit ships with a number of built-in connectors: @@ -32,7 +33,7 @@ system and architecture. 
Alternatively you can build the binary yourself (for instructions on building a connector please refer to the readme of that specific connector). -### Installing the Connector in Conduit +### Installing a Connector in Conduit Conduit loads standalone connectors at startup. By default, it looks for binaries in the folder `connectors` relative to the Conduit binary (you can diff --git a/docs/introduction/connectors.mdx b/docs/introduction/connectors.mdx new file mode 100644 index 00000000..48cbee8d --- /dev/null +++ b/docs/introduction/connectors.mdx @@ -0,0 +1,18 @@ +--- +title: 'Connectors' +slug: 'connectors' +sidebar_position: 4 +--- + +A connector is an external process which communicates with Conduit and knows how +to read/write records from/to a data source/destination (e.g. a database). + +When thinking about connectors for Conduit, our goals were to: +- provide a good development experience to connector developers, +- ship Conduit with real built-in connectors (compiled into the Conduit binary), +- to make it as easy as possible to write plugins in _any_ programming language, +- the [Connector SDK](https://github.com/conduitio/conduit-connector-sdk) to be + decoupled from Conduit and be able to change without changing Conduit itself. + +Have a look at our [connector docs](/docs/connectors/installing-connectors) to +find out more! \ No newline at end of file diff --git a/docs/introduction/getting-started.mdx b/docs/introduction/getting-started.mdx index 0952e039..cb91b276 100644 --- a/docs/introduction/getting-started.mdx +++ b/docs/introduction/getting-started.mdx @@ -6,21 +6,27 @@ hide_title: true sidebar_label: "Getting Started" --- -
- Conduit Logo -
+Conduit Logo +Conduit is a data integration tool for software engineers. Its purpose is to +help you move data from A to B. You can use Conduit to send data from Kafka to +Postgres, between files and APIs, +between [supported connectors](https://github.com/ConduitIO/conduit/blob/main/docs/connectors.md), +and [any datastore you can build a plugin for](/docs/introduction/plugins). -Conduit is a data integration tool for software engineers. Its purpose is to help you move data from A to B. You can use Conduit to send data from Kafka to Postgres, between files and APIs, between [supported connectors](https://github.com/ConduitIO/conduit/blob/main/docs/connectors.md), and [any datastore you can build a plugin for](/docs/introduction/plugins). - -It's written in [GoLang](https://go.dev/), compiles to a binary, and is designed to be easy to use and [deploy](https://docs.conduit.io/docs/Deploy/overview). +It's written in [GoLang](https://go.dev/), compiles to a binary, and is designed +to be easy to use and [deploy](https://docs.conduit.io/docs/Deploy/overview). To get started: -1. [Download the latest Conduit release](https://github.com/ConduitIO/conduit/releases). +1. [Download the latest Conduit release](https://github.com/ConduitIO/conduit/releases/latest). 2. Unzip: -If you’re on Mac, it will look something like this: +If you're on Mac, it will look something like this: ```shell tar zxvf conduit_0.3.0_Darwin_x86_64.tar.gz @@ -32,7 +38,8 @@ tar zxvf conduit_0.3.0_Darwin_x86_64.tar.gz ./conduit ``` -**Tip**: Depending on your operating system, you may need to run `chmod +x conduit` before running the binary. +**Tip**: Depending on your operating system, you may need to +run `chmod +x conduit` before running the binary. 4. 
Navigate to `http://localhost:8080`: diff --git a/docs/introduction/glossary.mdx b/docs/introduction/glossary.mdx index 4bec2264..73982fc8 100644 --- a/docs/introduction/glossary.mdx +++ b/docs/introduction/glossary.mdx @@ -3,13 +3,17 @@ title: "Glossary" slug: "glossary" --- - -* **Pipeline** - a pipeline receives records from one or multiple source connectors, pushes them through zero or - multiple processors until they reach one or multiple destination connectors. -* **Connector** - a connector is the internal entity that communicates with a connector plugin and either pushes records - from the plugin into the pipeline (source connector) or the other way around (destination connector). -* **Connector plugin** - sometimes also referred to as "plugin", is an external process which communicates with Conduit - and knows how to read/write records from/to a data source/destination (e.g. a database). -* **Processor** - a component that executes an operation on a single record that flows through the pipeline. It can - either change the record or filter it out based on some criteria. -* **Record** - a record represents a single piece of data that flows through a pipeline (e.g. one database row). \ No newline at end of file +* **Pipeline** - a pipeline receives records from one or multiple source + connectors, pushes them through zero or multiple processors until they reach + one or multiple destination connectors. +* **Connector** - a connector is the internal entity that communicates with a + connector plugin and either pushes records from the plugin into the pipeline + (source connector) or the other way around (destination connector). +* **Connector plugin** - sometimes also referred to as "plugin", is an external + process which communicates with Conduit and knows how to read/write records + from/to a data source/destination (e.g. a database). +* **Processor** - a component that executes an operation on a single record that + flows through the pipeline. 
It can either change the record or filter it out + based on some criteria. +* **Record** - a record represents a single piece of data that flows through a + pipeline (e.g. one database row). \ No newline at end of file diff --git a/docs/introduction/plugins.mdx b/docs/introduction/plugins.mdx deleted file mode 100644 index 6f38bce0..00000000 --- a/docs/introduction/plugins.mdx +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: 'Building Plugins' -slug: 'plugins' -sidebar_position: 4 ---- - -A plugin is an external process which communicates with Conduit and knows how to read/write records from/to a data source/destination (e.g. a database). - -When thinking about plugins for Conduit, our goals were to: -- provide a good development experience to plugin developers. -- ship Conduit with real built-in plugins (compiled into the Conduit binary), -- to make it as easy as possible to write plugins in _any_ programming language -- the Plugin SDK to be decoupled from Conduit and be able to change without changing Conduit itself. - -Have a look at -the [Connector SDK](https://github.com/conduitio/conduit-connector-sdk) to find -out how to build a Conduit connector written in Go. \ No newline at end of file diff --git a/docs/introduction/what-is-conduit.mdx b/docs/introduction/what-is-conduit.mdx index 3298864d..dfd30577 100644 --- a/docs/introduction/what-is-conduit.mdx +++ b/docs/introduction/what-is-conduit.mdx @@ -4,9 +4,9 @@ sidebar_position: 0 slug: "what-is-conduit" --- -import BetaAlert from "../../src/components/specific-alerts/BetaAlert"; - -Conduit is a new open-source project, written in [Go](https://go.dev/), that delivers real-time data integration for developers. It aims to provide the best user experience for building and running real-time data pipelines. +Conduit is a new open-source project, written in [Go](https://go.dev/), that +delivers real-time data integration for developers. 
It aims to provide the best +user experience for building and running real-time data pipelines. Out of the box, Conduit comes with: @@ -29,19 +29,14 @@ on multi-core machines. Conduit pipelines listen for changes to a database, data warehouse, etc., and allows your data applications to act upon those changes in real-time. -### It’s flexible - -Conduit pipelines listen for changes to a database, data warehouse, etc., and -allows your data applications to act upon those changes in real-time. - -### It’s Extensible +### It’s extensible Conduit connectors are plugins that communicate with Conduit via a gRPC interface. This means that plugins can be written in any language as long as -they conform to the required interface. More in-depth documentation regarding -plugins is in progress! +they conform to the required interface. Check out +our [connector docs](/docs/connectors/installing-connectors)! -## Get Involved +## Get involved Here are your next steps: diff --git a/static/images/standalone-vs-builtin.svg b/static/images/standalone-vs-builtin.svg new file mode 100644 index 00000000..95359987 --- /dev/null +++ b/static/images/standalone-vs-builtin.svg @@ -0,0 +1 @@ + \ No newline at end of file From 94ec753057f9cc88a7ee485d89f35d06dbb56848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Tue, 25 Oct 2022 17:24:59 +0200 Subject: [PATCH 4/8] Update docs/introduction/architecture.mdx Co-authored-by: Maha Hajja <82542081+maha-hajja@users.noreply.github.com> --- docs/introduction/architecture.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/introduction/architecture.mdx b/docs/introduction/architecture.mdx index 69340fea..e0785a09 100644 --- a/docs/introduction/architecture.mdx +++ b/docs/introduction/architecture.mdx @@ -7,7 +7,7 @@ Here is an overview of the Conduit Architecture. 
![Conduit Architecture](/images/conduit/conduit-diagram.svg) -Conduit is split in the following layers: +Conduit is split into the following layers: * **API layer** - exposes the public APIs used to communicate with Conduit. It exposes 2 types of APIs: * **gRPC** - this is the main API provided by Conduit. The gRPC API definition can be found in [api.proto](https://github.com/ConduitIO/conduit/blob/main/proto/api/v1/api.proto), it can be used to generate code for the client. From 1e59e10e2847b8f294a8d1c54ef4f460f4825e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Tue, 25 Oct 2022 17:25:32 +0200 Subject: [PATCH 5/8] Update docs/introduction/getting-started.mdx Co-authored-by: Maha Hajja <82542081+maha-hajja@users.noreply.github.com> --- docs/introduction/getting-started.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/introduction/getting-started.mdx b/docs/introduction/getting-started.mdx index cb91b276..1c2266d6 100644 --- a/docs/introduction/getting-started.mdx +++ b/docs/introduction/getting-started.mdx @@ -41,7 +41,7 @@ tar zxvf conduit_0.3.0_Darwin_x86_64.tar.gz **Tip**: Depending on your operating system, you may need to run `chmod +x conduit` before running the binary. -4. Navigate to `http://localhost:8080`: +4. 
Navigate to `http://localhost:8080` to check Conduit's UI: ![Conduit Pipeline](/images/conduit/pipeline.png) From fd2589bee8cb6ab600d492702d8a13c35ace4685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Thu, 3 Nov 2022 16:14:04 +0100 Subject: [PATCH 6/8] fix review comments --- docs/configuration/pipeline-configuration-files.mdx | 11 +++++++---- docs/connectors/behavior.mdx | 2 +- docs/introduction/connectors.mdx | 4 ++-- docs/introduction/getting-started.mdx | 2 +- docs/introduction/{glossary.mdx => vocabulary.mdx} | 4 ++-- docs/introduction/what-is-conduit.mdx | 2 +- docs/processors/getting-started.md | 9 ++++----- 7 files changed, 18 insertions(+), 16 deletions(-) rename docs/introduction/{glossary.mdx => vocabulary.mdx} (96%) diff --git a/docs/configuration/pipeline-configuration-files.mdx b/docs/configuration/pipeline-configuration-files.mdx index 6e44f52c..aef86d66 100644 --- a/docs/configuration/pipeline-configuration-files.mdx +++ b/docs/configuration/pipeline-configuration-files.mdx @@ -7,6 +7,7 @@ Pipeline configuration files give you the ability to define pipelines that are p It's as simple as creating a YAML file that defines pipelines, connectors, processors, and their corresponding configurations. ## Getting started + Create a folder called `pipelines` at the same level as your Conduit binary file, add all your YAML files there, then run Conduit using the command: ``` @@ -22,6 +23,7 @@ the CLI flag `pipelines.path` and point to your file or directory: If your directory does not exist, Conduit will fail with an error: `"pipelines.path" config value is invalid` ### YAML Schema + The file in general has two root keys, the `version`, and the `pipelines` map. The map consists of other elements like `status` and `name`, which are configurations for the pipeline itself. 
@@ -78,19 +80,20 @@ if the processor has a pipeline parent, then the processor ID will be `connector has a connector parent, then the processor ID will be `pipelineID:connectorID:processorID`. ## Pipelines Immutability + Pipelines provisioned by configuration files are **immutable**, any updates needed on a provisioned pipeline have to be -done through the configuration file it was provisioned from. You can only control stopping and starting a pipeline +done through the configuration file. You can only control stopping and starting a pipeline through the UI or API. ### Updates and Deletes + Updates and deletes for a pipeline provisioned by configuration files can only be done through the configuration files. Changes should be made to the files, then Conduit has to be restarted to reload the changes. Any updates or deletes done through the API or UI will be prohibited. -* To delete a pipeline: simply delete it from the `pipelines` map from the configuration file, then run conduit again. -* To update a pipeline: change any field value from the configuration file, and run conduit again to address these updates. +* To delete a pipeline: simply delete it from the `pipelines` map from the configuration file, then run Conduit again. +* To update a pipeline: change any field value from the configuration file, and run Conduit again to address these updates. Updates will preserve the status of the pipeline, and will continue working from where it stopped. However, the pipeline will start from the beginning of the source and will not continue from where it stopped, if one of these values were updated: {`pipeline ID`, `connector ID`, `connector plugin`, `connector type`}. - diff --git a/docs/connectors/behavior.mdx b/docs/connectors/behavior.mdx index 022d6490..b94da978 100644 --- a/docs/connectors/behavior.mdx +++ b/docs/connectors/behavior.mdx @@ -48,7 +48,7 @@ Conduit and communicate with Conduit through Go channels instead of gRPC. 
Any connector written in Go can be compiled into the Conduit binary and used as a built-in connector. -Find ouy more about the [Conduit connector plugin architecture](https://github.com/ConduitIO/conduit/blob/main/docs/architecture-decision-records/20220121-conduit-plugin-architecture.md). +Find out more about the [Conduit connector plugin architecture](https://github.com/ConduitIO/conduit/blob/main/docs/architecture-decision-records/20220121-conduit-plugin-architecture.md). ## Protocol gRPC Interface diff --git a/docs/introduction/connectors.mdx b/docs/introduction/connectors.mdx index 48cbee8d..a91ddd82 100644 --- a/docs/introduction/connectors.mdx +++ b/docs/introduction/connectors.mdx @@ -4,8 +4,8 @@ slug: 'connectors' sidebar_position: 4 --- -A connector is an external process which communicates with Conduit and knows how -to read/write records from/to a data source/destination (e.g. a database). +A connector runs in a separate process which communicates with Conduit and knows +how to read/write records from/to a data source/destination (e.g. a database). When thinking about connectors for Conduit, our goals were to: - provide a good development experience to connector developers, diff --git a/docs/introduction/getting-started.mdx b/docs/introduction/getting-started.mdx index 1c2266d6..95098c0f 100644 --- a/docs/introduction/getting-started.mdx +++ b/docs/introduction/getting-started.mdx @@ -16,7 +16,7 @@ Conduit is a data integration tool for software engineers. Its purpose is to help you move data from A to B. You can use Conduit to send data from Kafka to Postgres, between files and APIs, between [supported connectors](https://github.com/ConduitIO/conduit/blob/main/docs/connectors.md), -and [any datastore you can build a plugin for](/docs/introduction/plugins). +and [any datastore you can build a plugin for](/docs/connectors/building-connectors). 
It's written in [GoLang](https://go.dev/), compiles to a binary, and is designed to be easy to use and [deploy](https://docs.conduit.io/docs/Deploy/overview). diff --git a/docs/introduction/glossary.mdx b/docs/introduction/vocabulary.mdx similarity index 96% rename from docs/introduction/glossary.mdx rename to docs/introduction/vocabulary.mdx index 73982fc8..fdbb683d 100644 --- a/docs/introduction/glossary.mdx +++ b/docs/introduction/vocabulary.mdx @@ -1,6 +1,6 @@ --- -title: "Glossary" -slug: "glossary" +title: "Vocabulary" +slug: "vocabulary" --- * **Pipeline** - a pipeline receives records from one or multiple source diff --git a/docs/introduction/what-is-conduit.mdx b/docs/introduction/what-is-conduit.mdx index dfd30577..4e5c04d0 100644 --- a/docs/introduction/what-is-conduit.mdx +++ b/docs/introduction/what-is-conduit.mdx @@ -15,7 +15,7 @@ Out of the box, Conduit comes with: - processors - observability -The 🔋 are included. +The batteries are included 🔋 ### It’s simple, yet powerful diff --git a/docs/processors/getting-started.md b/docs/processors/getting-started.md index 56f2abf9..b2c684d1 100644 --- a/docs/processors/getting-started.md +++ b/docs/processors/getting-started.md @@ -35,9 +35,8 @@ Processors are created through the `/processors` endpoint. 
Here's an example: ```json lines POST /v1/processors { - // name of the processor in Conduit - // note that this is NOT a user-defined name for this processor - "name": "extractfieldpayload", + // type of the processor in Conduit + "type": "extractfieldpayload", "parent": { // type of parent: TYPE_CONNECTOR or TYPE_PIPELINE @@ -82,7 +81,7 @@ Here's an example of a request payload to create a JavaScript processor: ```json { - "name": "js", + "type": "js", "parent": { "type": "TYPE_CONNECTOR", "id": "d1ae72ea-9d9c-4bb2-b993-fdb7a01825ab" @@ -94,7 +93,7 @@ Here's an example of a request payload to create a JavaScript processor: } } ``` -The above will create a JavaScript processor (`"name": "js"`), attached to a connector (for the parent, we have +The above will create a JavaScript processor (`"type": "js"`), attached to a connector (for the parent, we have `"type": "TYPE_CONNECTOR"`). The script used is: ```javascript function process(record) { From f4d55da3eadee0480ac8a4c996ea1ea880d332b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Thu, 3 Nov 2022 16:20:52 +0100 Subject: [PATCH 7/8] add doc about referencing connectors --- docs/connectors/behavior.mdx | 2 +- docs/connectors/building.mdx | 2 +- docs/connectors/installing.mdx | 19 +++++++++++----- docs/connectors/referencing.mdx | 40 +++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 docs/connectors/referencing.mdx diff --git a/docs/connectors/behavior.mdx b/docs/connectors/behavior.mdx index b94da978..960f137b 100644 --- a/docs/connectors/behavior.mdx +++ b/docs/connectors/behavior.mdx @@ -2,7 +2,7 @@ title: "Connector Behavior" sidebar_label: "Behavior" slug: "behavior" -sidebar_position: 1 +sidebar_position: 2 --- This document provides insights on how Conduit communicates with a connector. 
diff --git a/docs/connectors/building.mdx b/docs/connectors/building.mdx index c27beeab..6527396a 100644 --- a/docs/connectors/building.mdx +++ b/docs/connectors/building.mdx @@ -1,7 +1,7 @@ --- title: "Building Connectors" slug: "building-connectors" -sidebar_position: 2 +sidebar_position: 3 --- Conduit connectors can be built in any programming language that supports gRPC. diff --git a/docs/connectors/installing.mdx b/docs/connectors/installing.mdx index d6886586..cd76cab1 100644 --- a/docs/connectors/installing.mdx +++ b/docs/connectors/installing.mdx @@ -33,11 +33,18 @@ system and architecture. Alternatively you can build the binary yourself (for instructions on building a connector please refer to the readme of that specific connector). -### Installing a Connector in Conduit +## Installing a Connector in Conduit -Conduit loads standalone connectors at startup. By default, it looks for -binaries in the folder `connectors` relative to the Conduit binary (you can -adjust this with the CLI flag `-connectors.path`). +Conduit loads standalone connectors at startup. The connector binaries need to +be placed in the `connectors` directory relative to the Conduit binary so +Conduit can find them. Alternatively, the path to the standalone connectors can +be adjusted using the CLI flag `-connectors.path`, for example: -To install a connector you need to place the connector binary in the connectors -folder and restart Conduit. +```shell +./conduit -connectors.path=/path/to/connectors/ +``` + +Names of the connector binaries are not important, since Conduit is getting the +information about connectors from connectors themselves (using their gRPC API). + +Find out how to [reference your connector](/docs/connectors/referencing-connectors). 
\ No newline at end of file diff --git a/docs/connectors/referencing.mdx b/docs/connectors/referencing.mdx new file mode 100644 index 00000000..c4266da3 --- /dev/null +++ b/docs/connectors/referencing.mdx @@ -0,0 +1,40 @@ +--- +title: "Referencing Connectors" +slug: "referencing-connectors" +sidebar_position: 1 +--- + +The name used to reference a connector in API requests (e.g. to create a new +connector) comes in the following format: + +`[PLUGIN-TYPE:]PLUGIN-NAME[@VERSION]` + +- `PLUGIN-TYPE` (`builtin`, `standalone` or `any`) + - Defines if the specified plugin should be builtin or standalone. + - If `any`, Conduit will use a standalone plugin if it exists and fall back to + a builtin plugin. + - Default is `any`. +- `PLUGIN-NAME` + - Defines the name of the plugin as specified in the plugin specifications, it + has to be an exact match. +- `VERSION` + - Defines the plugin version as specified in the plugin specifications, it has + to be an exact match. + - If `latest`, Conduit will use the latest semantic version. + - Default is `latest`. 
+ +Examples: + +- `postgres` + - will use the **latest** **standalone** **postgres** plugin + - will fallback to the **latest** **builtin** **postgres** plugin if + standalone wasn't found +- `postgres@v0.2.0` + - will use the **standalone** **postgres** plugin with version **v0.2.0** + - will fallback to a **builtin** **postgres** plugin with version **v0.2.0** + if standalone wasn't found +- `builtin:postgres` + - will use the **latest** **builtin** **postgres** plugin +- `standalone:postgres@v0.3.0` + - will use the **standalone** **postgres** plugin with version **v0.3.0** (no + fallback to builtin) From 57e3a2869a7fecf49936376c36f1b613aff58efd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lovro=20Ma=C5=BEgon?= Date: Fri, 4 Nov 2022 14:37:53 +0100 Subject: [PATCH 8/8] simplify connector introduction --- docs/introduction/connectors.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/introduction/connectors.mdx b/docs/introduction/connectors.mdx index a91ddd82..d46718e6 100644 --- a/docs/introduction/connectors.mdx +++ b/docs/introduction/connectors.mdx @@ -4,8 +4,8 @@ slug: 'connectors' sidebar_position: 4 --- -A connector runs in a separate process which communicates with Conduit and knows -how to read/write records from/to a data source/destination (e.g. a database). +A connector knows how to read/write records from/to a data source/destination +(e.g. a database). When thinking about connectors for Conduit, our goals were to: - provide a good development experience to connector developers, @@ -15,4 +15,4 @@ When thinking about connectors for Conduit, our goals were to: decoupled from Conduit and be able to change without changing Conduit itself. Have a look at our [connector docs](/docs/connectors/installing-connectors) to -find out more! \ No newline at end of file +find out more!