From a8ac5f9e33b4d8a867e36dfc49b26583f673fe5c Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 14 Sep 2022 02:14:22 -0700
Subject: [PATCH 1/7] refactor(ingest): simplify DefaultSQLParser alias (#5935)

---
 .../src/datahub/utilities/sql_parser.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/metadata-ingestion/src/datahub/utilities/sql_parser.py b/metadata-ingestion/src/datahub/utilities/sql_parser.py
index 28b5082ccbb3b2..12bd7e5f40bfe5 100644
--- a/metadata-ingestion/src/datahub/utilities/sql_parser.py
+++ b/metadata-ingestion/src/datahub/utilities/sql_parser.py
@@ -143,15 +143,4 @@ def get_columns(self) -> List[str]:
         return self.columns
 
 
-class DefaultSQLParser(SQLParser):
-    parser: SQLParser
-
-    def __init__(self, sql_query: str) -> None:
-        super().__init__(sql_query)
-        self.parser = SqlLineageSQLParser(sql_query)
-
-    def get_tables(self) -> List[str]:
-        return self.parser.get_tables()
-
-    def get_columns(self) -> List[str]:
-        return self.parser.get_columns()
+DefaultSQLParser = SqlLineageSQLParser
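
The alias keeps the public DefaultSQLParser name while dropping the delegating
wrapper: SqlLineageSQLParser already implements the SQLParser interface, so
existing call sites work unchanged. A minimal sketch of the call pattern,
assuming only the names visible in the diff (the query string is made up):

    from datahub.utilities.sql_parser import DefaultSQLParser

    # DefaultSQLParser is now a plain alias for SqlLineageSQLParser, so the
    # constructor and the get_tables/get_columns methods are unchanged.
    parser = DefaultSQLParser("SELECT id, name FROM orders")  # hypothetical query
    print(parser.get_tables())   # e.g. ['orders']
    print(parser.get_columns())  # e.g. ['id', 'name']
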
From e3f8f29a316ee33fa5c04390e69d594f7db1de95 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 14 Sep 2022 02:15:01 -0700
Subject: [PATCH 2/7] feat(ingest): support version option in datahub client
 get_aspect_v2 (#5934)

---
 metadata-ingestion/src/datahub/ingestion/graph/client.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py
index a56d183f0afc20..1df6e92e10a5bd 100644
--- a/metadata-ingestion/src/datahub/ingestion/graph/client.py
+++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -142,6 +142,7 @@ def get_aspect_v2(
         aspect_type: Type[Aspect],
         aspect: str,
         aspect_type_name: Optional[str] = None,
+        version: int = 0,
     ) -> Optional[Aspect]:
         """
         Get an aspect for an entity.
@@ -150,11 +151,12 @@ def get_aspect_v2(
         :param Type[Aspect] aspect_type: The type class of the aspect being requested (e.g. datahub.metadata.schema_classes.DatasetProperties)
         :param str aspect: The name of the aspect being requested (e.g. schemaMetadata, datasetProperties, etc.)
         :param Optional[str] aspect_type_name: The fully qualified classname of the aspect being requested. Typically not needed and extracted automatically from the class directly. (e.g. com.linkedin.common.DatasetProperties)
+        :param version: The version of the aspect to retrieve. The default of 0 means latest. Versions > 0 go from oldest to newest, so 1 is the oldest.
         :return: the Aspect as a dictionary if present, None if no aspect was found (HTTP status 404)
         :rtype: Optional[Aspect]
         :raises HttpError: if the HTTP response is not a 200 or a 404
         """
-        url: str = f"{self._gms_server}/aspects/{Urn.url_encode(entity_urn)}?aspect={aspect}&version=0"
+        url: str = f"{self._gms_server}/aspects/{Urn.url_encode(entity_urn)}?aspect={aspect}&version={version}"
         response = self._session.get(url)
         if response.status_code == 404:
             # not found
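
With the new parameter, callers can read historical versions of an aspect
instead of only the latest. A minimal sketch, assuming a reachable GMS
instance (the server URL and dataset URN below are placeholders):

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
    from datahub.metadata.schema_classes import DatasetPropertiesClass

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
    urn = "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users,PROD)"  # placeholder URN

    # version=0 (the default) returns the latest aspect; version=1 is the oldest.
    latest = graph.get_aspect_v2(urn, aspect_type=DatasetPropertiesClass, aspect="datasetProperties")
    oldest = graph.get_aspect_v2(
        urn, aspect_type=DatasetPropertiesClass, aspect="datasetProperties", version=1
    )
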
From 68ea92677046d9d068a73c27d9deb593573fffe3 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 14 Sep 2022 02:16:21 -0700
Subject: [PATCH 3/7] fix(ingest): add dbt redshift type mappings (#5933)

---
 metadata-ingestion/src/datahub/ingestion/source/dbt.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt.py b/metadata-ingestion/src/datahub/ingestion/source/dbt.py
index 8ea0be64d3356f..50319c0c4f227a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt.py
@@ -733,7 +733,8 @@ def get_column_type(
     # resolve modified type
     if dbt_adapter == "trino":
         TypeClass = resolve_trino_modified_type(column_type)
-    elif dbt_adapter == "postgres":
+    elif dbt_adapter == "postgres" or dbt_adapter == "redshift":
+        # Redshift uses a variant of Postgres, so we can use the same logic.
         TypeClass = resolve_postgres_modified_type(column_type)
 
     # if still not found, report the warning
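
The Postgres resolver maps parameterized type strings such as
"character varying(256)" onto DataHub's type classes, and Redshift spells its
types the same way. A rough sketch of the effect; the import path is an
assumption based on where the dbt source pulls its resolvers from and may
differ across versions:

    from datahub.ingestion.source.sql.sql_types import resolve_postgres_modified_type

    # Columns from a dbt Redshift adapter now go through the Postgres mapping
    # instead of falling through to the "type not found" warning.
    type_class = resolve_postgres_modified_type("character varying(256)")
    print(type_class)  # expected: DataHub's string type class, if the mapping matches
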
"$ELASTICSEARCH_AUTH_HEADER" "$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_index_template/$TEMPLATE_NAME" -H 'Content-Type: application/json' --data @/tmp/index_template.json else - echo -e "\ndatahub_usage_event_index_template exists" + echo -e "\n$TEMPLATE_NAME exists" fi } From c606abdb4033b3a88059da6a94c1ee043de4db7d Mon Sep 17 00:00:00 2001 From: firas omrane Date: Wed, 14 Sep 2022 14:26:42 +0200 Subject: [PATCH 7/7] fix(docker) Add platform to docker-compose command (#5683) --- docker/dev-with-cassandra.sh | 4 +-- docker/dev-without-neo4j.sh | 4 +-- docker/dev.sh | 4 +-- docker/quickstart.sh | 2 +- .../datahub/cli/{docker.py => docker_cli.py} | 27 ++++++++++++++++++- metadata-ingestion/src/datahub/entrypoints.py | 2 +- smoke-test/tests/utils.py | 2 +- 7 files changed, 35 insertions(+), 10 deletions(-) rename metadata-ingestion/src/datahub/cli/{docker.py => docker_cli.py} (97%) diff --git a/docker/dev-with-cassandra.sh b/docker/dev-with-cassandra.sh index 580816fc92b441..f71d91de190807 100755 --- a/docker/dev-with-cassandra.sh +++ b/docker/dev-with-cassandra.sh @@ -23,13 +23,13 @@ fi # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS. DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd $DIR && \ - COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose \ + COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \ -f docker-compose-with-cassandra.yml \ -f docker-compose.dev.yml \ $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE \ pull \ && \ - COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub \ + COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \ -f docker-compose-with-cassandra.yml \ -f docker-compose.dev.yml \ $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE \ diff --git a/docker/dev-without-neo4j.sh b/docker/dev-without-neo4j.sh index 0d8e0661fbfcfe..7fa55bb019adb7 100755 --- a/docker/dev-without-neo4j.sh +++ b/docker/dev-without-neo4j.sh @@ -22,13 +22,13 @@ fi # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS. DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd $DIR && \ - COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose \ + COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \ -f docker-compose-without-neo4j.yml \ -f docker-compose-without-neo4j.override.yml \ -f docker-compose.dev.yml \ $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE pull \ && \ - COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub \ + COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \ -f docker-compose-without-neo4j.yml \ -f docker-compose-without-neo4j.override.yml \ -f docker-compose.dev.yml \ diff --git a/docker/dev.sh b/docker/dev.sh index 1af45c0c6faafd..9f7fafdaf3d5ec 100755 --- a/docker/dev.sh +++ b/docker/dev.sh @@ -23,13 +23,13 @@ fi # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS. 
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd $DIR && \ - COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose \ + COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \ -f docker-compose.yml \ -f docker-compose.override.yml \ -f docker-compose.dev.yml \ $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE pull \ && \ - COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub \ + COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \ -f docker-compose.yml \ -f docker-compose.override.yml \ -f docker-compose.dev.yml \ diff --git a/docker/quickstart.sh b/docker/quickstart.sh index f7f14d8af9c226..a7eadf18bcb664 100755 --- a/docker/quickstart.sh +++ b/docker/quickstart.sh @@ -37,7 +37,7 @@ then else echo "No Datahub Neo4j volume found, starting with elasticsearch as graph service" cd $DIR && \ - docker-compose -p datahub \ + DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \ -f quickstart/docker-compose-without-neo4j.quickstart.yml \ $MONITORING_COMPOSE $CONSUMERS_COMPOSE $M1_COMPOSE up $@ fi diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker_cli.py similarity index 97% rename from metadata-ingestion/src/datahub/cli/docker.py rename to metadata-ingestion/src/datahub/cli/docker_cli.py index f5cec600136a36..950d7d555e8ba8 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -56,6 +56,12 @@ GITHUB_M1_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{M1_QUICKSTART_COMPOSE_FILE}" GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}" +DOCKER_COMPOSE_PLATFORM = ( + subprocess.run(["uname", "-m"], stdout=subprocess.PIPE) + .stdout.decode("utf-8") + .rstrip() +) + @click.group() def docker() -> None: @@ -208,6 +214,10 @@ def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None: subprocess.run( [*base_command, "stop"], check=True, + env={ + **os.environ, + "DOCKER_DEFAULT_PLATFORM": DOCKER_COMPOSE_PLATFORM, + }, ) click.secho("Stopped datahub successfully.", fg="green") except subprocess.CalledProcessError: @@ -638,6 +648,10 @@ def quickstart( subprocess.run( [*base_command, "pull", "-q"], check=True, + env={ + **os.environ, + "DOCKER_DEFAULT_PLATFORM": DOCKER_COMPOSE_PLATFORM, + }, ) click.secho("Finished pulling docker images!") except subprocess.CalledProcessError: @@ -659,6 +673,7 @@ def quickstart( check=True, env={ **os.environ, + "DOCKER_DEFAULT_PLATFORM": DOCKER_COMPOSE_PLATFORM, "DOCKER_BUILDKIT": "1", }, ) @@ -674,7 +689,13 @@ def quickstart( # Attempt to run docker compose up every minute. if (datetime.datetime.now() - start_time) > up_attempts * up_interval: click.echo() - subprocess.run(base_command + ["up", "-d", "--remove-orphans"]) + subprocess.run( + base_command + ["up", "-d", "--remove-orphans"], + env={ + **os.environ, + "DOCKER_DEFAULT_PLATFORM": DOCKER_COMPOSE_PLATFORM, + }, + ) up_attempts += 1 # Check docker health every few seconds. 
@@ -694,6 +715,10 @@ def quickstart(
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             check=True,
+            env={
+                **os.environ,
+                "DOCKER_DEFAULT_PLATFORM": DOCKER_COMPOSE_PLATFORM,
+            },
         )
 
         log_file.write(ret.stdout)
diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py
index 8492eb4c20112a..8e917d3f394f44 100644
--- a/metadata-ingestion/src/datahub/entrypoints.py
+++ b/metadata-ingestion/src/datahub/entrypoints.py
@@ -11,7 +11,7 @@
 from datahub.cli.check_cli import check
 from datahub.cli.cli_utils import DATAHUB_CONFIG_PATH, write_datahub_config
 from datahub.cli.delete_cli import delete
-from datahub.cli.docker import docker
+from datahub.cli.docker_cli import docker
 from datahub.cli.get_cli import get
 from datahub.cli.ingest_cli import ingest
 from datahub.cli.migrate import migrate
diff --git a/smoke-test/tests/utils.py b/smoke-test/tests/utils.py
index b0588d3bd7219d..490f258558a674 100644
--- a/smoke-test/tests/utils.py
+++ b/smoke-test/tests/utils.py
@@ -6,7 +6,7 @@
 import requests
 
 from datahub.cli import cli_utils
-from datahub.cli.docker import check_local_docker_containers
+from datahub.cli.docker_cli import check_local_docker_containers
 from datahub.ingestion.run.pipeline import Pipeline
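
The Python changes mirror the shell scripts: detect the host architecture once
via uname -m, then forward the caller's environment plus the platform override
on every docker-compose invocation. A condensed sketch of that pattern, drawn
from the diff above (the "ps" command is a placeholder; the real calls are the
pull, up, stop, and logs invocations):

    import os
    import subprocess

    # Mirror of the shell scripts' DOCKER_DEFAULT_PLATFORM="$(uname -m)".
    DOCKER_COMPOSE_PLATFORM = (
        subprocess.run(["uname", "-m"], stdout=subprocess.PIPE)
        .stdout.decode("utf-8")
        .rstrip()
    )

    # Forward the full environment plus the platform override so the
    # docker-compose subprocess sees both.
    subprocess.run(
        ["docker-compose", "ps"],  # placeholder command
        check=True,
        env={**os.environ, "DOCKER_DEFAULT_PLATFORM": DOCKER_COMPOSE_PLATFORM},
    )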