From 5068e015e634d4f5b8bff87477020f64f5829e65 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 2 May 2024 10:46:36 +0300 Subject: [PATCH 01/14] Add rest_api source docs --- .../verified-sources/rest_api.md | 220 ++++++++++++++++++ docs/website/sidebars.js | 1 + 2 files changed, 221 insertions(+) create mode 100644 docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md new file mode 100644 index 0000000000..8500ef6cbb --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -0,0 +1,220 @@ +--- +title: REST API generic source +description: dlt verified source for REST APIs +keywords: [rest api, restful api] +--- +import Header from './_source-info-header.md'; + +# REST API Generic Source + +
+ +This is a generic dlt source you can use to extract data from any REST API. It uses declarative configuration to define the API endpoints, their relationships, parameters, pagination, and authentication. + +## Setup Guide + +### Initialize the verified source + +Enter the following command: + + ```sh + dlt init rest_api duckdb + ``` + +[dlt init](../../reference/command-line-interface) will initialize the pipeline example with REST API as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). + +## Add credentials + +In the `.dlt` folder, you'll find a file called `secrets.toml`, where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. + +The GitHub API requires an access token to be set in the `secrets.toml` file. +Here is an example of how to set the token in the `secrets.toml` file: + +```toml +[sources.rest_api.github] +github_token = "your_github_token" +``` + +## Run the pipeline + +1. Install the required dependencies by running the following command: + + ```sh + pip install -r requirements.txt + ``` + +2. Run the pipeline: + + ```sh + python rest_api_pipeline.py + ``` + +3. 
Verify that everything loaded correctly by using the following command: + + ```sh + dlt pipeline rest_api show + ``` + +## Source Configuration + +Let's take a look at the GitHub example in `rest_api_pipeline.py` file: + +```python +def load_github() -> None: + pipeline = dlt.pipeline( + pipeline_name="rest_api_github", + destination="duckdb", + dataset_name="rest_api_data", + ) + + github_config: RESTAPIConfig = { + "client": { + "base_url": "https://api.github.com/repos/dlt-hub/dlt/", + "auth": { + "token": dlt.secrets["github_token"], + }, + }, + "resource_defaults": { + "primary_key": "id", + "write_disposition": "merge", + "endpoint": { + "params": { + "per_page": 100, + }, + }, + }, + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "issues", + "params": { + "sort": "updated", + "direction": "desc", + "state": "open", + "since": { + "type": "incremental", + "cursor_path": "updated_at", + "initial_value": "2024-01-25T11:21:28Z", + }, + }, + }, + }, + { + "name": "issue_comments", + "endpoint": { + "path": "issues/{issue_number}/comments", + "params": { + "issue_number": { + "type": "resolve", + "resource": "issues", + "field": "number", + } + }, + }, + "include_from_parent": ["id"], + }, + ], + } + + github_source = rest_api_source(github_config) + + load_info = pipeline.run(github_source) + print(load_info) +``` + +The declarative configuration is defined in the `github_config` dictionary. It contains the following key components: + +1. `client`: Defines the base URL and authentication method for the API. In this case it uses token-based authentication. The token is stored in the `secrets.toml` file. + +2. `resource_defaults`: Contains default settings for all resources. + +3. `resources`: A list of resources to be loaded. In this example, we have two resources: `issues` and `issue_comments`. Which correspond to the GitHub API endpoints for issues and issue comments. + +Each resource has a name and an endpoint configuration. 
The endpoint configuration includes: + +- `path`: The path to the API endpoint. +- `method`: The HTTP method to be used. Default is `GET`. +- `params`: Query parameters to be sent with each request. For example, `sort` to order the results. +- `json`: The JSON payload to be sent with the request (for POST and PUT requests). +- `paginator`: Configuration for paginating the results. +- `data_selector`: A JSON path to select the data from the response. +- `response_actions`: A list of actions that define how to process the response data. +- `incremental`: Configuration for incremental loading. + +When you pass this configuration to the `rest_api_source` function, it creates a dlt source object that can be used with the pipeline. + +`rest_api_source` function takes the following arguments: + +- `config`: The REST API configuration dictionary. +- `name`: An optional name for the source. +- `section`: An optional section name in the configuration file. +- `max_table_nesting`: Sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. +- `root_key` (bool): Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. +- `schema_contract`: Schema contract settings that will be applied to this resource. +- `spec`: A specification of configuration and secret values required by the source. + +## Define Resource Relationships + +When you have a resource that depends on another resource, you can define the relationship using the resolve field type. + +In the GitHub example, the `issue_comments` resource depends on the `issues` resource. The `issue_number` parameter in the `issue_comments` endpoint configuration is resolved from the `number` field of the `issues` resource. 
+ +```python +{ + "name": "issue_comments", + "endpoint": { + "path": "issues/{issue_number}/comments", + "params": { + "issue_number": { + "type": "resolve", + "resource": "issues", + "field": "number", + } + }, + }, +}, +``` + +This configuration tells the source to get issue numbers from the `issues` resource and use them to fetch comments for each issue. + +## Incremental Loading + +To set up incremental loading for a resource, you can use two options: + +1. Defining a special parameter in the `params` section of the endpoint configuration: + + ```python + "<parameter_name>": { + "type": "incremental", + "cursor_path": "<path_to_cursor_field>", + "initial_value": "<initial_value>", + }, + ``` + + For example, in the `issues` resource configuration in the GitHub example, we have: + + ```python + "since": { + "type": "incremental", + "cursor_path": "updated_at", + "initial_value": "2024-01-25T11:21:28Z", + }, + ``` + + This configuration tells the source to create an incremental object that will keep track of the `updated_at` field in the response and use it as a value for the `since` parameter in subsequent requests. + +2. Specifying the `incremental` field in the endpoint configuration: + + ```python + "incremental": { + "start_param": "<parameter_name>", + "end_param": "<parameter_name>", + "cursor_path": "<path_to_cursor_field>", + "initial_value": "<initial_value>", + "end_value": "<end_value>", + }, + ``` + + This configuration is more flexible and allows you to specify the start and end conditions for the incremental loading. 
+ diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 728c3b6593..a3fe12c8fb 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -83,6 +83,7 @@ const sidebars = { 'dlt-ecosystem/verified-sources/notion', 'dlt-ecosystem/verified-sources/personio', 'dlt-ecosystem/verified-sources/pipedrive', + 'dlt-ecosystem/verified-sources/rest_api', 'dlt-ecosystem/verified-sources/salesforce', 'dlt-ecosystem/verified-sources/scrapy', 'dlt-ecosystem/verified-sources/shopify', From a32e81bc1dcb6afb4b50b8e9e55e9a8d0a5c2679 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 10 May 2024 13:20:18 +0200 Subject: [PATCH 02/14] Expand rest_api documentation --- .../verified-sources/rest_api.md | 352 +++++++++++++++--- 1 file changed, 309 insertions(+), 43 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 8500ef6cbb..197b1f9160 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -5,30 +5,44 @@ keywords: [rest api, restful api] --- import Header from './_source-info-header.md'; -# REST API Generic Source -
This is a generic dlt source you can use to extract data from any REST API. It uses declarative configuration to define the API endpoints, their relationships, parameters, pagination, and authentication. -## Setup Guide +## Setup guide ### Initialize the verified source -Enter the following command: +Enter the following command in your terminal: - ```sh - dlt init rest_api duckdb - ``` +```sh +dlt init rest_api duckdb +``` -[dlt init](../../reference/command-line-interface) will initialize the pipeline example with REST API as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). +[dlt init](../../reference/command-line-interface) will initialize the pipeline examples for REST API as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). -## Add credentials +Running `dlt init` creates the following in the current folder: +- `rest_api_pipeline.py` file with a sample pipelines definition: + - GitHub API example + - Pokemon API example +- `.dlt` folder with: + - `secrets.toml` file to store your access tokens and other sensitive information + - `config.toml` file to store the configuration settings +- `requirements.txt` file with the required dependencies + +Change the REST API source to your needs by modifying the `rest_api_pipeline.py` file. See the detailed [source configuration](#source-configuration) section below. + +:::note +For the rest of the guide, we will use the [GitHub API](https://docs.github.com/en/rest?apiVersion=2022-11-28) and [Pokemon API](https://pokeapi.co/) as example sources. +::: + +### Add credentials In the `.dlt` folder, you'll find a file called `secrets.toml`, where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. -The GitHub API requires an access token to be set in the `secrets.toml` file. 
-Here is an example of how to set the token in the `secrets.toml` file: +The GitHub API [requires an access token](https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api?apiVersion=2022-11-28) to access some of its endpoints and to increase the rate limit for the API calls. To get a GitHub token, follow the GitHub documentation on [managing your personal access tokens](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens). + +After you get the token, add it to the `secrets.toml` file: ```toml [sources.rest_api.github] @@ -55,7 +69,9 @@ github_token = "your_github_token" dlt pipeline rest_api show ``` -## Source Configuration +## Source configuration + +### Quick example Let's take a look at the GitHub example in `rest_api_pipeline.py` file: @@ -123,66 +139,305 @@ def load_github() -> None: print(load_info) ``` -The declarative configuration is defined in the `github_config` dictionary. It contains the following key components: +The declarative resource configuration is defined in the `github_config` dictionary. It contains the following key components: 1. `client`: Defines the base URL and authentication method for the API. In this case it uses token-based authentication. The token is stored in the `secrets.toml` file. -2. `resource_defaults`: Contains default settings for all resources. +2. `resource_defaults`: Contains default settings for all resources. In this example, we define that all resources: + - Have `id` as the [primary key](../../general-usage/resource#define-schema) + - Use the `merge` [write disposition](../../general-usage/incremental-loading#choosing-a-write-disposition) to merge the data with the existing data in the destination. + - Send a `per_page` query parameter with each request to 100 to get more results per page. + +3. `resources`: A list of resources to be loaded. 
In this example, we have two resources: `issues` and `issue_comments`, which correspond to the GitHub API endpoints for [repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) and [issue comments](https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#list-issue-comments). Note that we need an issue number to fetch comments for each issue. This number is taken from the `issues` resource. More on this in the [resource relationships](#define-resource-relationships) section. + +Let's break down the configuration in more detail. + +### Configuration structure + +:::tip +Import the `RESTAPIConfig` type from the `rest_api` module to have convenient hints in your editor/IDE: + +```python +from rest_api import RESTAPIConfig +``` +::: + + +The configuration object passed to the REST API Generic Source has three main elements: + +```py +config: RESTAPIConfig = { + "client": { + ... + }, + "resource_defaults": { + ... + }, + "resources": [ + ... + ], +} +``` + +#### `client` + +`client` contains the configuration to connect to the API's endpoints. It includes the following fields: + +- `base_url` (str): The base URL of the API. This string is prepended to all endpoint paths. For example, if the base URL is `https://api.example.com/v1/`, and the endpoint path is `users`, the full URL will be `https://api.example.com/v1/users`. +- `headers` (dict, optional): Additional headers to be sent with each request. +- `auth` (optional): Authentication configuration. It can be a simple token, an `AuthConfigBase` object, or a more complex authentication method. +- `paginator` (optional): Configuration for the default pagination to be used for resources that support pagination. See the [pagination](#pagination) section for more details. + +#### `resource_defaults` (optional) + +`resource_defaults` contains the default values to configure the dlt resources. 
This configuration is applied to all resources unless overridden by the resource-specific configuration. + +For example, you can set the primary key, write disposition, and other default settings here: + +```py +config = { + "client": { + ... + }, + "resource_defaults": { + "primary_key": "id", + "write_disposition": "merge", + "endpoint": { + "params": { + "per_page": 100, + }, + }, + }, + "resources": [ + "resource1", + { + "name": "resource2", + "write_disposition": "append", + "endpoint": { + "params": { + "param1": "value1", + }, + }, + }, + ], +} +``` + +Above, all resources will have `primary_key` set to `id`, `resource1` will have `write_disposition` set to `merge`, and `resource2` will override the default `write_disposition` with `append`. +Both `resource1` and `resource2` will have the `per_page` parameter set to 100. + +#### `resources` + +This is a list of resource configurations that define the API endpoints to be loaded. Each resource configuration can be: +- a dictionary with the [resource configuration](#resource-configuration). +- a string. In this case, the string is used as both the endpoint path and the resource name, and the resource configuration is taken from the `resource_defaults` configuration if it exists. + +### Resource configuration -3. `resources`: A list of resources to be loaded. In this example, we have two resources: `issues` and `issue_comments`. Which correspond to the GitHub API endpoints for issues and issue comments. +A resource configuration has the following fields: -Each resource has a name and an endpoint configuration. The endpoint configuration includes: +- `endpoint`: The endpoint configuration for the resource. It can be a string or a dict representing the endpoint settings. See the [endpoint configuration](#endpoint-configuration) section for more details. +- `write_disposition`: The write disposition for the resource. +- `primary_key`: The primary key for the resource. 
+- `include_from_parent`: A list of fields from the parent resource to be included in the resource output. +- `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource. + +### Endpoint configuration + +The endpoint configuration defines how to query the API endpoint. Quick example: + +```py +{ + "path": "issues", + "method": "GET", + "params": { + "sort": "updated", + "direction": "desc", + "state": "open", + "since": { + "type": "incremental", + "cursor_path": "updated_at", + "initial_value": "2024-01-25T11:21:28Z", + }, + }, + "data_selector": "results", +} +``` + +The fields in the endpoint configuration are: - `path`: The path to the API endpoint. - `method`: The HTTP method to be used. Default is `GET`. - `params`: Query parameters to be sent with each request. For example, `sort` to order the results. - `json`: The JSON payload to be sent with the request (for POST and PUT requests). -- `paginator`: Configuration for paginating the results. -- `data_selector`: A JSON path to select the data from the response. +- `paginator`: Pagination configuration for the endpoint. See the [pagination](#pagination) section for more details. +- `data_selector`: A JSONPath to select the data from the response. See the [data selection](#data-selection) section for more details. - `response_actions`: A list of actions that define how to process the response data. - `incremental`: Configuration for incremental loading. -When you pass this configuration to the `rest_api_source` function, it creates a dlt source object that can be used with the pipeline. +### Pagination -`rest_api_source` function takes the following arguments: +The REST API source will try to automatically handle pagination for you. This works by detecting the pagination details from the first API response. -- `config`: The REST API configuration dictionary. 
-- `name`: An optional name for the source. -- `section`: An optional section name in the configuration file. -- `max_table_nesting`: Sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. -- `root_key` (bool): Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. -- `schema_contract`: Schema contract settings that will be applied to this resource. -- `spec`: A specification of configuration and secret values required by the source. +In some special cases, you may need to specify the pagination configuration explicitly. -## Define Resource Relationships +These are the available paginator types: -When you have a resource that depends on another resource, you can define the relationship using the resolve field type. +| Paginator type | String Alias | Description | +| -------------- | ------------ | ----------- | +| JSONResponsePaginator | `json_links` | The links to the next page are in the body (JSON) of the response. | +| HeaderLinkPaginator | `header_links` | The links to the next page are in the response headers. | +| OffsetPaginator | `offset` | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided. | +| PageNumberPaginator | `page_number` | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided. | +| JSONCursorPaginator | `json_cursor` | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON). | +| SinglePagePaginator | `single_page` | The response will be interpreted as a single-page response, ignoring possible pagination metadata. | -In the GitHub example, the `issue_comments` resource depends on the `issues` resource. 
The `issue_number` parameter in the `issue_comments` endpoint configuration is resolved from the `number` field of the `issues` resource. +To specify the pagination configuration, you can use the `paginator` field in the endpoint configuration: ```python { - "name": "issue_comments", - "endpoint": { - "path": "issues/{issue_number}/comments", - "params": { - "issue_number": { - "type": "resolve", - "resource": "issues", - "field": "number", - } + "path": "issues", + "paginator": { + "type": "json_links", + "next_url_path": "paging.next", + }, +} +``` +### Data selection + +The `data_selector` field in the endpoint configuration allows you to specify a JSONPath to select the data from the response. By default, the source will try to detect locations of the data automatically. + +Use this field when you need to specify the location of the data in the response explicitly. + +For example, if the API response looks like this: + +```json +{ + "posts": [ + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"} + ] +} +``` + +You can use the following endpoint configuration: + +```python +{ + "path": "posts", + "data_selector": "posts", +} +``` + +For a nested structure like this: + +```json +{ + "results": { + "posts": [ + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"} + ] + } +} +``` + +You can use the following endpoint configuration: + +```python +{ + "path": "posts", + "data_selector": "results.posts", +} +``` + +Read more about [JSONPath syntax](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) to learn how to write selectors. + + +### Authentication + +Many APIs require authentication to access their endpoints. The REST API source supports various authentication methods, such as token-based, query parameters, basic auth, etc. + +#### Quick example + +One of the most common method is token-based authentication. 
To authenticate with a token, you can use the `token` field in the `auth` configuration: + +```python +{ + "client": { + ... + "auth": { + "token": dlt.secrets["your_api_token"], }, + ... }, -}, +} +``` + +:::warning +Make sure to store your access tokens and other sensitive information in the `secrets.toml` file and never commit it to the version control system. +::: + +Available authentication methods: + +| Authentication type | Description | +| ------------------- | ----------- | +| BearerTokenAuth | Bearer token authentication. | +| HTTPBasicAuth | Basic HTTP authentication. | +| APIKeyAuth | API key authentication with key defined in the query parameters or in the headers. | + +### Define resource relationships + +When you have a resource that depends on another resource, you can define the relationship using the resolve field type. + +In the GitHub example, the `issue_comments` resource depends on the `issues` resource. The `issue_number` parameter in the `issue_comments` endpoint configuration is resolved from the `number` field of the `issues` resource: + +```py +{ + "resources": [ + { + "name": "issues", + "endpoint": { + "path": "issues", + ... + }, + }, + { + "name": "issue_comments", + "endpoint": { + "path": "issues/{issue_number}/comments", + "params": { + "issue_number": { + "type": "resolve", + "resource": "issues", + "field": "number", + } + }, + }, + }, + ], +} ``` This configuration tells the source to get issue numbers from the `issues` resource and use them to fetch comments for each issue. -## Incremental Loading +The syntax for the `resolve` field in parameter configuration is: + +```py +"<parameter_name>": { + "type": "resolve", + "resource": "<parent_resource_name>", + "field": "<parent_resource_field_name>", +} +``` + +## Incremental loading To set up incremental loading for a resource, you can use two options: -1. 
Defining a special parameter in the `params` section of the [endpoint configuration](#endpoint-configuration): ```python "": { @@ -204,7 +459,7 @@ To set up incremental loading for a resource, you can use two options: This configuration tells the source to create an incremental object that will keep track of the `updated_at` field in the response and use it as a value for the `since` parameter in subsequent requests. -2. Specifying the `incremental` field in the endpoint configuration: +2. Specifying the `incremental` field in the [endpoint configuration](#endpoint-configuration): ```python "incremental": { @@ -218,3 +473,14 @@ To set up incremental loading for a resource, you can use two options: This configuration is more flexible and allows you to specify the start and end conditions for the incremental loading. +## `rest_api_source()` function + +`rest_api_source` function takes the following arguments: + +- `config`: The REST API configuration dictionary. +- `name`: An optional name for the source. +- `section`: An optional section name in the configuration file. +- `max_table_nesting`: Sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. +- `root_key` (bool): Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. +- `schema_contract`: Schema contract settings that will be applied to this resource. +- `spec`: A specification of configuration and secret values required by the source. 
From 6f87d70b4da8fb0c27ab51e39e4abdec02f2c96d Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 10 May 2024 13:58:12 +0200 Subject: [PATCH 03/14] Update snippets --- .../dlt-ecosystem/verified-sources/rest_api.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 197b1f9160..22ff2a49da 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -75,7 +75,7 @@ github_token = "your_github_token" Let's take a look at the GitHub example in `rest_api_pipeline.py` file: -```python +```py def load_github() -> None: pipeline = dlt.pipeline( pipeline_name="rest_api_github", @@ -157,7 +157,7 @@ Let's break down the configuration in more detail. :::tip Import the `RESTAPIConfig` type from the `rest_api` module to have convenient hints in your editor/IDE: -```python +```py from rest_api import RESTAPIConfig ``` ::: @@ -294,7 +294,7 @@ These are the available paginator types: To specify the pagination configuration, you can use the `paginator` field in the endpoint configuration: -```python +```py { "path": "issues", "paginator": { @@ -323,7 +323,7 @@ For example, if the API response looks like this: You can use the following endpoint configuration: -```python +```py { "path": "posts", "data_selector": "posts", @@ -346,7 +346,7 @@ For a nested structure like this: You can use the following endpoint configuration: -```python +```py { "path": "posts", "data_selector": "results.posts", @@ -364,7 +364,7 @@ Many APIs require authentication to access their endpoints. The REST API source One of the most common method is token-based authentication. To authenticate with a token, you can use the `token` field in the `auth` configuration: -```python +```py { "client": { ... 
@@ -439,7 +439,7 @@ To set up incremental loading for a resource, you can use two options: 1. Defining a special parameter in the `params` section of the [endpoint configuration](#endpoint-configuration): - ```python + ```py "": { "type": "incremental", "cursor_path": "", @@ -449,7 +449,7 @@ To set up incremental loading for a resource, you can use two options: For example, in the `issues` resource configuration in the GitHub example, we have: - ```python + ```py "since": { "type": "incremental", "cursor_path": "updated_at", @@ -461,7 +461,7 @@ To set up incremental loading for a resource, you can use two options: 2. Specifying the `incremental` field in the [endpoint configuration](#endpoint-configuration): - ```python + ```py "incremental": { "start_param": "", "end_param": "", From f4b6094c118bd718d3601c6a04c21abc08ffd35b Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 10 May 2024 14:34:44 +0200 Subject: [PATCH 04/14] Update string aliases --- .../verified-sources/rest_api.md | 71 +++++++++++++++---- 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 22ff2a49da..d3aa48a7d9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -36,6 +36,8 @@ Change the REST API source to your needs by modifying the `rest_api_pipeline.py` For the rest of the guide, we will use the [GitHub API](https://docs.github.com/en/rest?apiVersion=2022-11-28) and [Pokemon API](https://pokeapi.co/) as example sources. ::: +This source is based on the [RESTClient class](../../general-usage/http/rest-client.md). + ### Add credentials In the `.dlt` folder, you'll find a file called `secrets.toml`, where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. 
@@ -283,26 +285,38 @@ In some special cases, you may need to specify the pagination configuration expl These are the available paginator types: -| Paginator type | String Alias | Description | +| Paginator class | String Alias (`type`) | Description | | -------------- | ------------ | ----------- | -| JSONResponsePaginator | `json_links` | The links to the next page are in the body (JSON) of the response. | -| HeaderLinkPaginator | `header_links` | The links to the next page are in the response headers. | -| OffsetPaginator | `offset` | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided. | -| PageNumberPaginator | `page_number` | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided. | -| JSONCursorPaginator | `json_cursor` | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON). | +| [JSONResponsePaginator](../../general-usage/http/rest-client.md#jsonresponsepaginator) | `json_response` | The links to the next page are in the body (JSON) of the response. | +| [HeaderLinkPaginator](../../general-usage/http/rest-client.md#headerlinkpaginator) | `header_link` | The links to the next page are in the response headers. | +| [OffsetPaginator](../../general-usage/http/rest-client.md#offsetpaginator) | `offset` | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided. | +| [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | `page_number` | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided. | +| [JSONCursorPaginator](../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | `cursor` | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON). 
| | SinglePagePaginator | `single_page` | The response will be interpreted as a single-page response, ignoring possible pagination metadata. | -To specify the pagination configuration, you can use the `paginator` field in the endpoint configuration: +To specify the pagination configuration, use the `paginator` field in the [client](#client) or [endpoint](#endpoint-configuration) configurations. You may use either the paginator class or the string alias in the `type` field along with the required parameters. ```py { - "path": "issues", + ... "paginator": { "type": "json_links", "next_url_path": "paging.next", - }, + } +} +``` + +Or using the paginator instance: + +```py +{ + ... + "paginator": JSONResponsePaginator( + next_url_path="paging.next" + ), } ``` + ### Data selection The `data_selector` field in the endpoint configuration allows you to specify a JSONPath to select the data from the response. By default, the source will try to detect locations of the data automatically. @@ -380,13 +394,40 @@ One of the most common method is token-based authentication. To authenticate wit Make sure to store your access tokens and other sensitive information in the `secrets.toml` file and never commit it to the version control system. ::: -Available authentication methods: +Available authentication types: + +| Authentication class | String Alias (`type`) | Description | +| ------------------- | ----------- | ----------- | +| [BearerTokenAuth](../../general-usage/http/rest-client.md#bearer-token-authentication) | `bearer` | Bearer token authentication. | +| [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `http_basic` | Basic HTTP authentication. | +| [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `api_key` | API key authentication with key defined in the query parameters or in the headers. 
| + +To specify the authentication configuration, use the `auth` field in the [client](#client) configuration: + +```py +{ + "client": { + "auth": { + "type": "bearer", + "token": dltd.secrets["your_api_token"], + }, + ... + }, +} +``` + +Alternatively, you can use the authentication class directly: + +```py +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth -| Authentication type | Description | -| ------------------- | ----------- | -| BearTokenAuth | Bearer token authentication. | -| HTTPBasicAuth | Basic HTTP authentication. | -| APIKeyAuth | API key authentication with key defined in the query parameters or in the headers. | +config = { + "client": { + "auth": BearTokenAuth(dltd.secrets["your_api_token"]), + }, + ... +} +``` ### Define resource relationships From 6d67c3427252c9c8d8cfdf7934a865c1719404f1 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 10 May 2024 14:42:47 +0200 Subject: [PATCH 05/14] Link dlt source --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index d3aa48a7d9..892ea432bd 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -514,9 +514,9 @@ To set up incremental loading for a resource, you can use two options: This configuration is more flexible and allows you to specify the start and end conditions for the incremental loading. -## `rest_api_source()` function +## Advanced configuration -`rest_api_source` function takes the following arguments: +`rest_api_source()` function creates the [dlt source](../../general-usage/source.md) and lets you configure the following parameters: - `config`: The REST API configuration dictionary. - `name`: An optional name for the source. 
From 1de8b026d56dbae06ff5261a39d2f475a9c305b1 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 10 May 2024 14:46:23 +0200 Subject: [PATCH 06/14] Fix typo --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 892ea432bd..d199440fad 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -409,7 +409,7 @@ To specify the authentication configuration, use the `auth` field in the [client "client": { "auth": { "type": "bearer", - "token": dltd.secrets["your_api_token"], + "token": dlt.secrets["your_api_token"], }, ... }, From 478cb5f4ae7e9ffd149d6048f5f2b9e1b8fc22c1 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Sat, 11 May 2024 10:07:21 +0200 Subject: [PATCH 07/14] Fix typo --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index d199440fad..4fc7449e59 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -383,7 +383,7 @@ One of the most common method is token-based authentication. To authenticate wit "client": { ... "auth": { - "token": dltd.secrets["your_api_token"], + "token": dlt.secrets["your_api_token"], }, ... }, @@ -423,7 +423,7 @@ from dlt.sources.helpers.rest_client.auth import BearerTokenAuth config = { "client": { - "auth": BearTokenAuth(dltd.secrets["your_api_token"]), + "auth": BearerTokenAuth(dlt.secrets["your_api_token"]), }, ... 
} From a89b15d54f2e54ad37d879cbda5054141b373139 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 13:55:33 +0200 Subject: [PATCH 08/14] Reordered code in the example and added a new section --- .../verified-sources/rest_api.md | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 4fc7449e59..9ec16ebd87 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -7,7 +7,7 @@ import Header from './_source-info-header.md';
-This is a generic dlt source you can use to extract data from any REST API. It uses declarative configuration to define the API endpoints, their relationships, parameters, pagination, and authentication. +This is a generic dlt source you can use to extract data from any REST API. It uses [declarative configuration](#source-configuration) to define the API endpoints, their [relationships](#define-resource-relationships), how to handle [pagination](#pagination), and [authentication](#authentication). ## Setup guide @@ -62,14 +62,14 @@ github_token = "your_github_token" 2. Run the pipeline: ```sh - python rest_api_pipeline.py - ``` + python rest_api_pipeline.py + ``` 3. Verify that everything loaded correctly by using the following command: - ```sh - dlt pipeline rest_api show - ``` + ```sh + dlt pipeline rest_api show + ``` ## Source configuration @@ -79,12 +79,6 @@ Let's take a look at the GitHub example in `rest_api_pipeline.py` file: ```py def load_github() -> None: - pipeline = dlt.pipeline( - pipeline_name="rest_api_github", - destination="duckdb", - dataset_name="rest_api_data", - ) - github_config: RESTAPIConfig = { "client": { "base_url": "https://api.github.com/repos/dlt-hub/dlt/", @@ -137,6 +131,12 @@ def load_github() -> None: github_source = rest_api_source(github_config) + pipeline = dlt.pipeline( + pipeline_name="rest_api_github", + destination="duckdb", + dataset_name="rest_api_data", + ) + load_info = pipeline.run(github_source) print(load_info) ``` @@ -145,19 +145,19 @@ The declarative resource configuration is defined in the `github_config` diction 1. `client`: Defines the base URL and authentication method for the API. In this case it uses token-based authentication. The token is stored in the `secrets.toml` file. -2. `resource_defaults`: Contains default settings for all resources. In this example, we define that all resources: +2. `resource_defaults`: Contains default settings for all [resources](#resource-configuration). 
In this example, we define that all resources: - Have `id` as the [primary key](../../general-usage/resource#define-schema) - Use the `merge` [write disposition](../../general-usage/incremental-loading#choosing-a-write-disposition) to merge the data with the existing data in the destination. - Send a `per_page` query parameter with each request to 100 to get more results per page. -3. `resources`: A list of resources to be loaded. In this example, we have two resources: `issues` and `issue_comments`, which correspond to the GitHub API endpoints for [repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) and [issue comments](https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#list-issue-comments). Note that we need a in issue number to fetch comments for each issue. This number is taken from the `issues` resource. More on this in the [resource relationships](#define-resource-relationships) section. +3. `resources`: A list of [resources](#resource-configuration) to be loaded. Here, we have two resources: `issues` and `issue_comments`, which correspond to the GitHub API endpoints for [repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) and [issue comments](https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#list-issue-comments). Note that we need an issue number to fetch comments for each issue. This number is taken from the `issues` resource. More on this in the [resource relationships](#define-resource-relationships) section. Let's break down the configuration in more detail. ### Configuration structure :::tip -Import the `RESTAPIConfig` type from the `rest_api` module to have convenient hints in your editor/IDE: +Import the `RESTAPIConfig` type from the `rest_api` module to have convenient hints in your editor/IDE and use it to define the configuration object. 
```py from rest_api import RESTAPIConfig @@ -192,7 +192,7 @@ config: RESTAPIConfig = { #### `resource_defaults` (optional) -`resource_defaults` contains the default values to configure the dlt resources. This configuration is applied to all resources unless overridden by the resource-specific configuration. +`resource_defaults` contains the default values to [configure the dlt resources](#resource-configuration). This configuration is applied to all resources unless overridden by the resource-specific configuration. For example, you can set the primary key, write disposition, and other default settings here: @@ -236,14 +236,16 @@ This is a list of resource configurations that define the API endpoints to be lo ### Resource configuration -A resource configuration has the following fields: +A resource configuration is used to define a [dlt resource](../../general-usage/resource.md) for the data to be loaded from an API endpoint. It contains the following key fields: - `endpoint`: The endpoint configuration for the resource. It can be a string or a dict representing the endpoint settings. See the [endpoint configuration](#endpoint-configuration) section for more details. - `write_disposition`: The write disposition for the resource. - `primary_key`: The primary key for the resource. -- `include_from_parent`: A list of fields from the parent resource to be included in the resource output. +- `include_from_parent`: A list of fields from the parent resource to be included in the resource output. See the [resource relationships](#include-fields-from-the-parent-resource) section for more details. - `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource. +You can also pass additional resource parameters that will be used to configure the dlt resource. See [dlt resource API reference](../../api_reference/extract/decorators.md#resource) for more details. 
+ ### Endpoint configuration The endpoint configuration defines how to query the API endpoint. Quick example: @@ -283,7 +285,11 @@ The REST API source will try to automatically handle pagination for you. This wo In some special cases, you may need to specify the pagination configuration explicitly. -These are the available paginator types: +:::note +Currently pagination is supported only for GET requests. To handle POST requests with pagination, you need to implement a [custom paginator](../../general-usage/http/rest-client.md#custom-paginator). +::: + +These are the available paginators: | Paginator class | String Alias (`type`) | Description | | -------------- | ------------ | ----------- | @@ -457,6 +463,7 @@ In the GitHub example, the `issue_comments` resource depends on the `issues` res } }, }, + "include_from_parent": ["id"], }, ], } @@ -474,6 +481,22 @@ The syntax for the `resolve` field in parameter configuration is: } ``` +#### Include fields from the parent resource + +You can include data from the parent resource in the child resource by using the `include_from_parent` field in the resource configuration. For example: + +```py +{ + "name": "issue_comments", + "endpoint": { + ... + }, + "include_from_parent": ["id", "title", "created_at"], +} +``` + +This will include the `id`, `title`, and `created_at` fields from the `issues` resource in the `issue_comments` resource data. The names of the included fields are prefixed with an underscore (`_`), the parent resource name, and another underscore, like so: `_issues_id`, `_issues_title`, `_issues_created_at`. 
+ ## Incremental loading To set up incremental loading for a resource, you can use two options: From f9d8b56f8c1a1bfe567dfad4bb74333fbd89d5e9 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 14:05:49 +0200 Subject: [PATCH 09/14] Mention auto detection --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 9ec16ebd87..3910f30dac 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -299,6 +299,7 @@ These are the available paginators: | [PageNumberPaginator](../../general-usage/http/rest-client.md#pagenumberpaginator) | `page_number` | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided. | | [JSONCursorPaginator](../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | `cursor` | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON). | | SinglePagePaginator | `single_page` | The response will be interpreted as a single-page response, ignoring possible pagination metadata. | +| `None` | `auto` | Explicitly specify that the source should automatically detect the pagination method. | To specify the pagination configuration, use the `paginator` field in the [client](#client) or [endpoint](#endpoint-configuration) configurations. You may use either the paginator class or the string alias in the `type` field along with the required parameters. @@ -323,6 +324,8 @@ Or using the paginator instance: } ``` +This is useful when you're [implementing and using a custom paginator](../../general-usage/http/rest-client.md#custom-paginator). 
+ ### Data selection The `data_selector` field in the endpoint configuration allows you to specify a JSONPath to select the data from the response. By default, the source will try to detect locations of the data automatically. From bce5cc2a72de10852a93c15c624dd98b46cda207 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 15:54:56 +0200 Subject: [PATCH 10/14] Reorder the sentence about paginator types and instances --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 3910f30dac..6282bd2887 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -301,7 +301,7 @@ These are the available paginators: | SinglePagePaginator | `single_page` | The response will be interpreted as a single-page response, ignoring possible pagination metadata. | | `None` | `auto` | Explicitly specify that the source should automatically detect the pagination method. | -To specify the pagination configuration, use the `paginator` field in the [client](#client) or [endpoint](#endpoint-configuration) configurations. You may use either the paginator class or the string alias in the `type` field along with the required parameters. +To specify the pagination configuration, use the `paginator` field in the [client](#client) or [endpoint](#endpoint-configuration) configurations. 
You may either use a dictionary with a string alias in the `type` field along with the required parameters, or use the paginator instance directly: ```py { From fbb32cfdf120965c1850ed4a2d1271c52f5e2248 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 16:11:00 +0200 Subject: [PATCH 11/14] Elaborate on dependent resources; link the transformer docs --- .../verified-sources/rest_api.md | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 6282bd2887..e1e271d34f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -440,7 +440,7 @@ config = { ### Define resource relationships -When you have a resource that depends on another resource, you can define the relationship using the resolve field type. +When you have a resource that depends on another resource, you can define the relationship using the `resolve` configuration. With it you link a path parameter in the child resource to a field in the parent resource's data. In the GitHub example, the `issue_comments` resource depends on the `issues` resource. The `issue_number` parameter in the `issue_comments` endpoint configuration is resolved from the `number` field of the `issues` resource: @@ -472,7 +472,21 @@ In the GitHub example, the `issue_comments` resource depends on the `issues` res } ``` -This configuration tells the source to get issue numbers from the `issues` resource and use them to fetch comments for each issue. +This configuration tells the source to get issue numbers from the `issues` resource and use them to fetch comments for each issue. 
So if the `issues` resource yields the following data: + +```json +[ + {"id": 1, "number": 123}, + {"id": 2, "number": 124}, + {"id": 3, "number": 125} +] +``` + +The `issue_comments` resource will make requests to the following endpoints: + +- `issues/123/comments` +- `issues/124/comments` +- `issues/125/comments` The syntax for the `resolve` field in parameter configuration is: @@ -484,6 +498,8 @@ The syntax for the `resolve` field in parameter configuration is: } ``` +Under the hood, dlt handles this by using a [transformer resource](../../general-usage/resource.md#process-resources-with-dlttransformer). + #### Include fields from the parent resource You can include data from the parent resource in the child resource by using the `include_from_parent` field in the resource configuration. For example: From 76fde81423832c616e6f188a8ad70008736063bd Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 16:30:58 +0200 Subject: [PATCH 12/14] Link incremental loading --- .../docs/dlt-ecosystem/verified-sources/rest_api.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index e1e271d34f..c6d79f7699 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -272,12 +272,12 @@ The fields in the endpoint configuration are: - `path`: The path to the API endpoint. - `method`: The HTTP method to be used. Default is `GET`. -- `params`: Query parameters to be sent with each request. For example, `sort` to order the results. +- `params`: Query parameters to be sent with each request. For example, `sort` to order the results or `since` to specify [incremental loading](#incremental-loading). This is also used to define [resource relationships](#define-resource-relationships). 
- `json`: The JSON payload to be sent with the request (for POST and PUT requests). - `paginator`: Pagination configuration for the endpoint. See the [pagination](#pagination) section for more details. - `data_selector`: A JSONPath to select the data from the response. See the [data selection](#data-selection) section for more details. - `response_actions`: A list of actions that define how to process the response data. -- `incremental`: Configuration for incremental loading. +- `incremental`: Configuration for [incremental loading](#incremental-loading). ### Pagination @@ -518,7 +518,10 @@ This will include the `id`, `title`, and `created_at` fields from the `issues` r ## Incremental loading -To set up incremental loading for a resource, you can use two options: +Some APIs provide a way to fetch only new or changed data (most often by using a timestamp field like `updated_at`, `created_at`, or incremental IDs). +This is called [incremental loading](../../general-usage/incremental-loading.md) and is very useful as it allows you to reduce the load time and the amount of data transferred. + +When the API endpoint supports incremental loading, you can configure the source to load only the new or changed data using these two methods: 1. Defining a special parameter in the `params` section of the [endpoint configuration](#endpoint-configuration): @@ -556,6 +559,8 @@ To set up incremental loading for a resource, you can use two options: This configuration is more flexible and allows you to specify the start and end conditions for the incremental loading. +See the [incremental loading](../../general-usage/incremental-loading.md#incremental-loading-with-a-cursor-field) guide for more details. 
+ ## Advanced configuration `rest_api_source()` function creates the [dlt source](../../general-usage/source.md) and lets you configure the following parameters: From cafa96c542d1eeb14f85bbfd4a2affddbf0a9a73 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 16:55:14 +0200 Subject: [PATCH 13/14] Update the example to use rest_api_resources --- .../docs/dlt-ecosystem/verified-sources/rest_api.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index c6d79f7699..2c0099c209 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -78,12 +78,15 @@ github_token = "your_github_token" Let's take a look at the GitHub example in `rest_api_pipeline.py` file: ```py -def load_github() -> None: +from rest_api import RESTAPIConfig, rest_api_resources + +@dlt.source +def github_source(github_token=dlt.secrets.value): github_config: RESTAPIConfig = { "client": { "base_url": "https://api.github.com/repos/dlt-hub/dlt/", "auth": { - "token": dlt.secrets["github_token"], + "token": github_token, }, }, "resource_defaults": { @@ -129,15 +132,16 @@ def load_github() -> None: ], } - github_source = rest_api_source(github_config) + yield from rest_api_resources(config) +def load_github() -> None: pipeline = dlt.pipeline( pipeline_name="rest_api_github", destination="duckdb", dataset_name="rest_api_data", ) - load_info = pipeline.run(github_source) + load_info = pipeline.run(github_source()) print(load_info) ``` From 913fa9145e61e38e161edbc1de6fcebeebf6c073 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 14 May 2024 17:01:18 +0200 Subject: [PATCH 14/14] Rename github_config --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 2c0099c209..1f79055d06 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -82,7 +82,7 @@ from rest_api import RESTAPIConfig, rest_api_resources @dlt.source def github_source(github_token=dlt.secrets.value): - github_config: RESTAPIConfig = { + config: RESTAPIConfig = { "client": { "base_url": "https://api.github.com/repos/dlt-hub/dlt/", "auth": { @@ -145,7 +145,7 @@ def load_github() -> None: print(load_info) ``` -The declarative resource configuration is defined in the `github_config` dictionary. It contains the following key components: +The declarative resource configuration is defined in the `config` dictionary. It contains the following key components: 1. `client`: Defines the base URL and authentication method for the API. In this case it uses token-based authentication. The token is stored in the `secrets.toml` file.