Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(protobuf): support for custom platform, subtypes, misc improvements #5973

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ project.ext.externalDependency = [
'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.10',
'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:1.1.1',
'cacheApi' : 'javax.cache:cache-api:1.1.0',
'commonsCli': 'commons-cli:commons-cli:1.5.0',
'commonsIo': 'commons-io:commons-io:2.4',
'commonsLang': 'commons-lang:commons-lang:2.6',
'commonsCollections': 'commons-collections:commons-collections:3.2.2',
Expand Down Expand Up @@ -230,6 +231,11 @@ subprojects {
}
}
} else {
tasks.withType(JavaExec).configureEach {
javaLauncher = javaToolchains.launcherFor {
languageVersion = JavaLanguageVersion.of(11)
}
}
tasks.withType(Javadoc).configureEach {
javadocTool = javaToolchains.javadocToolFor {
languageVersion = JavaLanguageVersion.of(11)
Expand Down
21 changes: 16 additions & 5 deletions docs-website/docusaurus.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ module.exports = {
favicon: "img/favicon.ico",
organizationName: "datahub-project", // Usually your GitHub org/user name.
projectName: "datahub", // Usually your repo name.
stylesheets: ["https://fonts.googleapis.com/css2?family=Manrope:wght@400;600&display=swap"],
stylesheets: [
"https://fonts.googleapis.com/css2?family=Manrope:wght@400;600&display=swap",
],
noIndex: isSaas,
customFields: {
isSaas: isSaas,
Expand Down Expand Up @@ -49,8 +51,12 @@ module.exports = {
title: null,
logo: {
alt: "DataHub Logo",
src: `img/${isSaas ? "acryl" : "datahub"}-logo-color-light-horizontal.svg`,
srcDark: `img/${isSaas ? "acryl" : "datahub"}-logo-color-dark-horizontal.svg`,
src: `img/${
isSaas ? "acryl" : "datahub"
}-logo-color-light-horizontal.svg`,
srcDark: `img/${
isSaas ? "acryl" : "datahub"
}-logo-color-dark-horizontal.svg`,
},
items: [
{
Expand Down Expand Up @@ -205,15 +211,20 @@ module.exports = {
blog: false,
theme: {
customCss: [
isSaas ? require.resolve("./src/styles/acryl.scss") : require.resolve("./src/styles/datahub.scss"),
isSaas
? require.resolve("./src/styles/acryl.scss")
: require.resolve("./src/styles/datahub.scss"),
require.resolve("./src/styles/global.scss"),
],
},
},
],
],
plugins: [
["@docusaurus/plugin-ideal-image", { quality: 100, sizes: [320, 640, 1280, 1440, 1600] }],
[
"@docusaurus/plugin-ideal-image",
{ quality: 100, sizes: [320, 640, 1280, 1440, 1600] },
],
"docusaurus-plugin-sass",
[
"docusaurus-graphql-plugin",
Expand Down
1 change: 1 addition & 0 deletions metadata-integration/java/datahub-client/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ shadowJar {
mergeServiceFiles()
// we relocate namespaces manually, because we want to know exactly which libs we are exposing and why
// we can move to automatic relocation using ConfigureShadowRelocation after we get to a good place on these first
relocate 'org.springframework', 'datahub.shaded.org.springframework'
relocate 'com.fasterxml.jackson', 'datahub.shaded.jackson'
relocate 'net.jcip.annotations', 'datahub.shaded.annotations'
relocate 'javassist', 'datahub.shaded.javassist'
Expand Down
6 changes: 4 additions & 2 deletions metadata-integration/java/datahub-client/scripts/check_jar.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# This script checks the shadow jar to ensure that we only have allowed classes being exposed through the jar
jarFiles=$(find build/libs -name "datahub-client*.jar" | grep -v sources | grep -v javadoc)
libName=datahub-client
jarishFile=$(find build/libs -name "${libName}*.jar" -exec ls -1rt "{}" +;)
jarFiles=$(echo "$jarishFile" | grep -v sources | grep -v javadoc | tail -n 1)
for jarFile in ${jarFiles}; do
jar -tvf $jarFile |\
grep -v "datahub/shaded" |\
Expand All @@ -14,7 +16,7 @@ jar -tvf $jarFile |\
grep -v " org/$" |\
grep -v " io/$" |\
grep -v "git.properties" |\
grep -v "org/springframework" |\
#grep -v "org/springframework" |\
grep -v "org/aopalliance" |\
grep -v "javax/" |\
grep -v "io/swagger" |\
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ task publishSchema(dependsOn: build) {
javaexec {
executable = javaLauncher.get().getExecutablePath().getAsFile().getAbsolutePath()
classpath = configurations.datahub
main = "datahub.protobuf.App"
args = ["${projectDir}/build/descriptors/main.dsc", file(f).getAbsoluteFile()]
main = "datahub.protobuf.Proto2DataHub"
args = ["--descriptor", "${projectDir}/build/descriptors/main.dsc", "--file", file(f).getAbsoluteFile()]
}
}
}
Expand Down
111 changes: 107 additions & 4 deletions metadata-integration/java/datahub-protobuf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -546,13 +546,115 @@ Add the following to your `pom.xml`.
</dependency>
```

## Example Application
## Example Application (embedded)

An example application is included which works with the `protobuf-gradle-plugin`, see the standalone [example project](../datahub-protobuf-example).
An example application **Proto2DataHub** is included as part of this project.
You can also set up a standalone project that works with the `protobuf-gradle-plugin`; see the [example project](../datahub-protobuf-example) for a reference setup.

### Usage

Using the example application:
#### Standalone Application: Proto2DataHub

```shell
java -jar build/libs/datahub-protobuf-0.8.45-SNAPSHOT.jar --help
usage: Proto2DataHub
--datahub_api <arg> [Optional] The API endpoint for DataHub GMS.
(defaults to https://localhost:8080)
--datahub_token <arg> [Optional] The authentication token for
DataHub API access. (defaults to empty)
--datahub_user <arg> [Optional] The datahub user to attribute this
ingestion to. (defaults to ..)
--descriptor <arg> [Required] The generated protobuf descriptor
file. Typically a single .dsc file for the
repo or a .protoc file (1:1 with each src
file)
--directory <arg> [Optional if using --file] The root directory
containing protobuf source files.
--env <arg> [Optional] The environment to attach all
entities to. Typically, DEV, PROD etc.
(defaults to DEV)
--exclude <arg> [Optional] Exclude patterns to avoid
processing all source files, separated by ,.
Typically used with --directory option.
Follows glob patterns: e.g. --exclude
"build/**,generated/**" will exclude all files
in the build and generated directories under
the rootDirectory given by the --directory
option
--file <arg> [Optional if using --directory] The protobuf
source file. Typically a .proto file.
--filename <arg> [Required if using transport file] Filename to
write output to.
--github_org <arg> [Optional] The GitHub organization that this
schema repository belongs to. We will
translate comments in your protoc files like
@datahub-project/data-team to GitHub team urls
like:
https://github.com/orgs/datahub-project/teams/
data-team
--help Print this help message
--platform <arg> [Optional] The data platform to produce
schemas for. e.g. kafka, snowflake, etc.
(defaults to kafka)
--slack_id <arg> [Optional] The Slack team id if your protobuf
files contain comments with references to
channel names. We will translate comments like
#data-eng in your protobuf file to slack urls
like:
https://slack.com/app_redirect?channel=data-en
g&team=T1234 following the documentation at
(https://api.slack.com/reference/deep-linking#
deep-linking-into-your-slack-app__opening-a-ch
annel-by-name-or-id) The easiest way to find
your Slack team id is to open your workspace
in your browser. It should look something
like:
https://app.slack.com/client/TUMKD5EGJ/... In
this case, the team-id is TUMKD5EGJ.
--subtype [Optional] A custom subtype to attach to all
entities produced. e.g. event, schema, topic
etc.(Default is schema)
--transport <arg> [Optional] What transport to use to
communicate with DataHub. Options are: rest
(default), kafka and file.
```

You can run it like a standard Java jar application:
```shell

java -jar build/libs/datahub-protobuf-0.8.45-SNAPSHOT.jar --descriptor ../datahub-protobuf-example/build/descriptors/main.dsc --directory ../datahub-protobuf-example/schema/protobuf/v1/clickstream/ --transport rest
```

Or run it using Gradle:
```shell
../../../gradlew run --args="--descriptor ../datahub-protobuf-example/build/descriptors/main.dsc --directory ../datahub-protobuf-example/schema/protobuf/v1/clickstream/ --transport rest"
```

Result:
```
java -jar build/libs/datahub-protobuf-0.8.45-SNAPSHOT.jar --descriptor ../datahub-protobuf-example/build/descriptors/main.dsc --directory ../datahub-protobuf-example/schema/protobuf/v1/clickstream/ --transport rest
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
✅ Successfully emitted 90 events for 5 files to DataHub REST
```

You can also route results to a file by using the `--transport file --filename events.json` options.

##### Important Flags
Here are a few important flags to use with this command:
- `--env`: Defaults to DEV. Switch to PROD once you have ironed out any issues with running this command.
- `--platform`: Defaults to `kafka` (as most people use protobuf schema repos with Kafka), but you can provide a custom platform name, e.g. `schema_repo` or `<company_name>_schemas`. If you use a custom platform, make sure to provision it on your DataHub instance (with a logo, etc.) to get a native experience.
- `--subtype`: Gives your entities a more descriptive category than Dataset in the UI. Defaults to schema, but you might find topic, event, or message more descriptive.



## Example Application (separate project)

The standalone [example project](../datahub-protobuf-example) shows how you can create an independent project that uses this library as part of a build task.

### Sample Usage:

```shell
export DATAHUB_API=...
Expand All @@ -563,5 +665,6 @@ export DATAHUB_TOKEN=...
# export DATAHUB_GITHUBORG=datahub-project
# export DATAHUB_SLACKID=

# publishSchema task will publish all the protobuf files into DataHub
./gradlew publishSchema
```
```
11 changes: 11 additions & 0 deletions metadata-integration/java/datahub-protobuf/build.gradle
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
plugins {
id("com.palantir.git-version") apply false
id "application"
}
apply plugin: 'java'
apply plugin: 'jacoco'
Expand All @@ -17,6 +18,13 @@ afterEvaluate {
targetCompatibility = 11
}
}
ext {
javaMainClass = "datahub.protobuf.Proto2DataHub"
}

application {
mainClassName = javaMainClass
}

dependencies {
implementation project(':metadata-models')
Expand All @@ -25,6 +33,7 @@ dependencies {
implementation externalDependency.protobuf
implementation externalDependency.jgrapht
implementation externalDependency.gson
implementation externalDependency.commonsCli

compileOnly externalDependency.lombok
annotationProcessor externalDependency.lombok
Expand Down Expand Up @@ -198,3 +207,5 @@ nexusStaging {
username = System.getenv("NEXUS_USERNAME")
password = System.getenv("NEXUS_PASSWORD")
}


Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# This script checks the shadow jar to ensure that we only have allowed classes being exposed through the jar
jarFiles=$(find build/libs -name "datahub-protobuf*.jar" | grep -v sources | grep -v javadoc)
libName=datahub-protobuf
jarishFile=$(find build/libs -name "${libName}*.jar" -exec ls -1rt "{}" +;)
jarFiles=$(echo "$jarishFile" | grep -v sources | grep -v javadoc | tail -n 1)
for jarFile in ${jarFiles}; do
jar -tvf $jarFile |\
grep -v "datahub/shaded" |\
Expand All @@ -18,7 +20,6 @@ jar -tvf $jarFile |\
grep -v " org/$" |\
grep -v " io/$" |\
grep -v "git.properties" |\
grep -v "org/springframework" |\
grep -v "org/aopalliance" |\
grep -v "javax/" |\
grep -v "io/swagger" |\
Expand Down

This file was deleted.

Loading