Commit …hub-project#10433)

- Use Openlineage 1.13.1 in Spark Plugin
- Add retry option to datahub client and Spark Plugin
- Add OpenLineage integration doc
# OpenLineage

DataHub now supports [OpenLineage](https://openlineage.io/) integration. With this support, DataHub can ingest and display lineage information from various data processing frameworks, giving users a comprehensive understanding of their data pipelines.

## Features

- **REST Endpoint Support**: DataHub now includes a REST endpoint that understands OpenLineage events. This allows users to send lineage information directly to DataHub, enabling easy integration with various data processing frameworks.

- **[Spark Event Listener Plugin](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)**: DataHub provides a Spark Event Listener plugin that seamlessly integrates with OpenLineage's Spark plugin. It extends DataHub's OpenLineage support with additional features such as PathSpec support, column-level lineage, patch support, and more.

## OpenLineage Support with DataHub

### 1. REST Endpoint Support

DataHub's REST endpoint allows users to send OpenLineage events directly to DataHub. This enables easy integration with various data processing frameworks and provides a centralized location for viewing and managing data lineage information.

For Spark and Airflow, we recommend using the Spark Lineage plugin or DataHub's Airflow plugin for tighter integration with DataHub.

#### How to Use

To send OpenLineage messages to DataHub using the REST endpoint, simply make a POST request to the following endpoint:

```
POST GMS_SERVER_HOST:GMS_PORT/openapi/openlineage/api/v1/lineage
```

Include the OpenLineage message in the request body in JSON format.

Example:

```json
{
  "eventType": "START",
  "eventTime": "2020-12-28T19:52:00.001+10:00",
  "run": {
    "runId": "d46e465b-d358-4d32-83d4-df660ff614dd"
  },
  "job": {
    "namespace": "workshop",
    "name": "process_taxes"
  },
  "inputs": [
    {
      "namespace": "postgres://workshop-db:None",
      "name": "workshop.public.taxes",
      "facets": {
        "dataSource": {
          "_producer": "https://github.com/OpenLineage/OpenLineage/tree/0.10.0/integration/airflow",
          "_schemaURL": "https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/DataSourceDatasetFacet",
          "name": "postgres://workshop-db:None",
          "uri": "workshop-db"
        }
      }
    }
  ],
  "producer": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client"
}
```
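
For reference, here is a minimal sketch of posting such an event from Java using the JDK's built-in HTTP client. The host, port, token placeholder, and `event.json` file name are assumptions for illustration, not part of any DataHub API:

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;

public class OpenLineageEmitExample {
  public static void main(String[] args) throws Exception {
    // Assumes a GMS instance at localhost:8080 and the event JSON above saved as event.json.
    String event = Files.readString(Path.of("event.json"));
    HttpRequest request =
        HttpRequest.newBuilder()
            .uri(URI.create("http://localhost:8080/openapi/openlineage/api/v1/lineage"))
            .header("Content-Type", "application/json")
            // If authentication is enabled, pass a personal access token.
            .header("Authorization", "Bearer <your-datahub-api-key>")
            .POST(HttpRequest.BodyPublishers.ofString(event))
            .build();
    HttpResponse<String> response =
        HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());
    System.out.println(response.statusCode());
  }
}
```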

##### How to set up Airflow

Follow the Airflow guide to set up your Airflow DAGs to send lineage information to DataHub. The guide can be found [here](https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/guides/user.html).
The transport should look like this:

```json
{
  "type": "http",
  "url": "https://GMS_SERVER_HOST:GMS_PORT/openapi/openlineage/",
  "endpoint": "api/v1/lineage",
  "auth": {
    "type": "api_key",
    "api_key": "your-datahub-api-key"
  }
}
```

#### Known Limitations

With Spark and Airflow we recommend using the Spark Lineage plugin or DataHub's Airflow plugin for tighter integration with DataHub.

- **[PathSpec](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta/#configuring-hdfs-based-dataset-urns) Support**: While the REST endpoint accepts OpenLineage messages, full PathSpec support is not yet available.

- **Column-level Lineage**: DataHub's current OpenLineage support does not provide full column-level lineage tracking.

- Other advanced features available in the Spark plugin are likewise not yet supported through the REST endpoint.

### 2. Spark Event Listener Plugin

DataHub's Spark Event Listener plugin enhances OpenLineage support by providing additional features such as PathSpec support, column-level lineage, and more.

#### How to Use

Follow the guide on the Spark Lineage plugin page for more information on how to set up the plugin. The guide can be found [here](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta). A rough configuration sketch follows.
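
As a hypothetical sketch only, the setup can look roughly like this; the artifact coordinates, listener class, and configuration keys below are assumptions, so defer to the plugin guide for the authoritative values:

```java
import org.apache.spark.sql.SparkSession;

public class SparkLineageExample {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .appName("test-application")
            // Pull in the DataHub Spark lineage plugin (version placeholder).
            .config("spark.jars.packages", "io.acryl:acryl-spark-lineage:<version>")
            // Register the DataHub event listener.
            .config("spark.extraListeners", "datahub.spark.DatahubSparkListener")
            // Point the listener at your DataHub GMS instance.
            .config("spark.datahub.rest.server", "http://localhost:8080")
            .getOrCreate();

    // ... run the job; the listener emits lineage to DataHub as it executes ...

    spark.stop();
  }
}
```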

## References

- [OpenLineage](https://openlineage.io/)
- [DataHub OpenAPI Guide](../api/openapi/openapi-usage-guide.md)
- [DataHub Spark Lineage Plugin](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)
package datahub.client.rest;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.ConnectException;
import java.net.NoRouteToHostException;
import java.net.UnknownHostException;
import java.util.Arrays;
import javax.net.ssl.SSLException;
import lombok.extern.slf4j.Slf4j;
import org.apache.hc.client5.http.impl.DefaultHttpRequestRetryStrategy;
import org.apache.hc.core5.http.ConnectionClosedException;
import org.apache.hc.core5.http.HttpRequest;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.HttpStatus;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;

/**
 * Retry strategy for the DataHub REST client: skips retries for non-recoverable I/O errors and
 * retries requests that fail with a 429, 500, or 503 status code.
 */
@Slf4j
public class DatahubHttpRequestRetryStrategy extends DefaultHttpRequestRetryStrategy {
  public DatahubHttpRequestRetryStrategy() {
    this(1, TimeValue.ofSeconds(10));
  }

  public DatahubHttpRequestRetryStrategy(int maxRetries, TimeValue retryInterval) {
    super(
        maxRetries,
        retryInterval,
        // I/O exception types that are considered non-retriable.
        Arrays.asList(
            InterruptedIOException.class,
            UnknownHostException.class,
            ConnectException.class,
            ConnectionClosedException.class,
            NoRouteToHostException.class,
            SSLException.class),
        // HTTP status codes that warrant a retry.
        Arrays.asList(
            HttpStatus.SC_TOO_MANY_REQUESTS,
            HttpStatus.SC_SERVICE_UNAVAILABLE,
            HttpStatus.SC_INTERNAL_SERVER_ERROR));
  }

  @Override
  public boolean retryRequest(
      HttpRequest request, IOException exception, int execCount, HttpContext context) {
    log.warn("Checking if retry is needed: {}", execCount);
    return super.retryRequest(request, exception, execCount, context);
  }

  @Override
  public boolean retryRequest(HttpResponse response, int execCount, HttpContext context) {
    // Log before delegating; the parent strategy decides whether this response is retried.
    log.warn("Checking if retry is needed for response: {}", response);
    return super.retryRequest(response, execCount, context);
  }
}
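
For context, a strategy like this is plugged into an Apache HttpClient 5 builder. Below is a minimal sketch assuming the classic (blocking) client; the DataHub emitter itself may wire it into an async builder instead:

```java
import datahub.client.rest.DatahubHttpRequestRetryStrategy;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.util.TimeValue;

public class RetryWiringExample {
  public static void main(String[] args) throws Exception {
    // Retry up to 3 times, waiting 5 seconds between attempts.
    try (CloseableHttpClient client =
        HttpClients.custom()
            .setRetryStrategy(new DatahubHttpRequestRetryStrategy(3, TimeValue.ofSeconds(5)))
            .build()) {
      // Use the client to emit metadata; requests failing with retriable
      // exceptions or status codes are retried automatically.
    }
  }
}
```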
/*
/* Copyright 2018-2024 contributors to the OpenLineage project
/* SPDX-License-Identifier: Apache-2.0
*/

package io.openlineage.spark.agent.util;

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import datahub.spark.conf.SparkAppContext;
import datahub.spark.conf.SparkConfigParser;
import io.datahubproject.openlineage.config.DatahubOpenlineageConfig;
import io.datahubproject.openlineage.dataset.HdfsPathDataset;
import io.openlineage.client.OpenLineage.InputDataset;
import io.openlineage.client.OpenLineage.OutputDataset;
import io.openlineage.spark.api.OpenLineageContext;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

/**
 * Utility class to handle removing path patterns in dataset names. Given a configured regex
 * pattern with a "remove" group defined, the class methods run regex replacements on all the
 * datasets available within the event.
 */
@Slf4j
public class RemovePathPatternUtils {
  public static final String REMOVE_PATTERN_GROUP = "remove";
  public static final String SPARK_OPENLINEAGE_DATASET_REMOVE_PATH_PATTERN =
      "spark.openlineage.dataset.removePath.pattern";

  private static Optional<SparkConf> sparkConf = Optional.empty();

  public static List<OutputDataset> removeOutputsPathPattern_ol(
      OpenLineageContext context, List<OutputDataset> outputs) {
    return getPattern(context)
        .map(
            pattern ->
                outputs.stream()
                    .map(
                        dataset -> {
                          String newName = removePath(pattern, dataset.getName());
                          // Compare by value, not by reference.
                          if (!newName.equals(dataset.getName())) {
                            return context
                                .getOpenLineage()
                                .newOutputDatasetBuilder()
                                .name(newName)
                                .namespace(dataset.getNamespace())
                                .facets(dataset.getFacets())
                                .outputFacets(dataset.getOutputFacets())
                                .build();
                          } else {
                            return dataset;
                          }
                        })
                    .collect(Collectors.toList()))
        .orElse(outputs);
  }

  // This method was replaced to support DataHub PathSpecs
  public static List<OutputDataset> removeOutputsPathPattern(
      OpenLineageContext context, List<OutputDataset> outputs) {
    return outputs.stream()
        .map(
            dataset -> {
              String newName = removePathPattern(dataset.getName());
              // Compare by value, not by reference.
              if (!newName.equals(dataset.getName())) {
                return context
                    .getOpenLineage()
                    .newOutputDatasetBuilder()
                    .name(newName)
                    .namespace(dataset.getNamespace())
                    .facets(dataset.getFacets())
                    .outputFacets(dataset.getOutputFacets())
                    .build();
              } else {
                return dataset;
              }
            })
        .collect(Collectors.toList());
  }

  // This method was replaced to support DataHub PathSpecs
  public static List<InputDataset> removeInputsPathPattern(
      OpenLineageContext context, List<InputDataset> inputs) {
    return inputs.stream()
        .map(
            dataset -> {
              String newName = removePathPattern(dataset.getName());
              // Compare by value, not by reference.
              if (!newName.equals(dataset.getName())) {
                return context
                    .getOpenLineage()
                    .newInputDatasetBuilder()
                    .name(newName)
                    .namespace(dataset.getNamespace())
                    .facets(dataset.getFacets())
                    .inputFacets(dataset.getInputFacets())
                    .build();
              } else {
                return dataset;
              }
            })
        .collect(Collectors.toList());
  }

  private static Optional<Pattern> getPattern(OpenLineageContext context) {
    return Optional.ofNullable(context.getSparkContext())
        .map(sparkContext -> sparkContext.conf())
        .filter(conf -> conf.contains(SPARK_OPENLINEAGE_DATASET_REMOVE_PATH_PATTERN))
        .map(conf -> conf.get(SPARK_OPENLINEAGE_DATASET_REMOVE_PATH_PATTERN))
        .map(pattern -> Pattern.compile(pattern));
  }

  private static String removePath(Pattern pattern, String name) {
    return Optional.ofNullable(pattern.matcher(name))
        .filter(matcher -> matcher.find())
        .filter(
            matcher -> {
              try {
                matcher.group(REMOVE_PATTERN_GROUP);
                return true;
              } catch (IllegalStateException | IllegalArgumentException e) {
                return false;
              }
            })
        .filter(matcher -> StringUtils.isNotEmpty(matcher.group(REMOVE_PATTERN_GROUP)))
        .map(
            matcher ->
                name.substring(0, matcher.start(REMOVE_PATTERN_GROUP))
                    + name.substring(matcher.end(REMOVE_PATTERN_GROUP), name.length()))
        .orElse(name);
  }

  /**
   * SparkConf does not change during the job's lifetime, but it can get lost once the session is
   * closed. It is good to keep a reference to it in case of SPARK-29046.
   */
  private static Optional<SparkConf> loadSparkConf() {
    if (!sparkConf.isPresent() && SparkSession.getDefaultSession().isDefined()) {
      sparkConf = Optional.of(SparkSession.getDefaultSession().get().sparkContext().getConf());
    }
    return sparkConf;
  }

  private static String removePathPattern(String datasetName) {
    // TODO: The reliance on global mutable state here should be changed; it led to problems in
    // the PathUtilsTest class, where some tests interfered with others.
    log.info("Removing path pattern from dataset name {}", datasetName);
    Optional<SparkConf> conf = loadSparkConf();
    if (!conf.isPresent()) {
      return datasetName;
    }
    try {
      String propertiesString =
          Arrays.stream(conf.get().getAllWithPrefix("spark.datahub."))
              .map(tup -> tup._1 + "= \"" + tup._2 + "\"")
              .collect(Collectors.joining("\n"));
      Config datahubConfig = ConfigFactory.parseString(propertiesString);
      DatahubOpenlineageConfig datahubOpenlineageConfig =
          SparkConfigParser.sparkConfigToDatahubOpenlineageConf(
              datahubConfig, new SparkAppContext());
      HdfsPathDataset hdfsPath =
          HdfsPathDataset.create(new URI(datasetName), datahubOpenlineageConfig);
      log.debug("Transformed path is {}", hdfsPath.getDatasetPath());
      return hdfsPath.getDatasetPath();
    } catch (InstantiationException e) {
      log.warn(
          "Unable to convert dataset {} to path; the exception was {}",
          datasetName,
          e.getMessage());
      return datasetName;
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }
}
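
To make the "remove"-group mechanics above concrete, here is a small self-contained sketch; the pattern and path are hypothetical examples for illustration, not defaults of the plugin:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RemovePathExample {
  public static void main(String[] args) {
    // A pattern whose "remove" group matches the date partition of a path.
    Pattern pattern = Pattern.compile("(?<remove>/\\d{4}/\\d{2}/\\d{2})(/.*|$)");
    String name = "s3://bucket/events/2024/05/01/part-0000.parquet";
    Matcher matcher = pattern.matcher(name);
    if (matcher.find()) {
      // Strip only the span covered by the named group, keeping the rest of the name.
      String cleaned =
          name.substring(0, matcher.start("remove")) + name.substring(matcher.end("remove"));
      System.out.println(cleaned); // s3://bucket/events/part-0000.parquet
    }
  }
}
```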