From e14dc9159c67e65c02d317f736fe7a01eb50ffc4 Mon Sep 17 00:00:00 2001
From: Pedro Silva
Date: Wed, 31 Jul 2024 18:45:29 +0100
Subject: [PATCH] feat(cli): Trim report of dataHubExecutionRequestResult to max GMS size (#11051)

---
 docs/how/updating-datahub.md                             | 3 ++-
 .../reporting/datahub_ingestion_run_summary_provider.py  | 6 +++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 2821b63e7d305a..08ababcb5cfce9 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -81,7 +81,8 @@ profiling:
 - #10498 - Tableau ingestion can now be configured to ingest multiple sites at once and add the sites as containers. The feature is currently only available for Tableau Server.
 - #10466 - Extends configuration in `~/.datahubenv` to match `DatahubClientConfig` object definition. See full configuration in https://datahubproject.io/docs/python-sdk/clients/. The CLI should now respect the updated configurations specified in `~/.datahubenv` across its functions and utilities. This means that for systems where ssl certification is disabled, setting `disable_ssl_verification: true` in `~./datahubenv` will apply to all CLI calls.
 - #11002 - We will not auto-generate a `~/.datahubenv` file. You must either run `datahub init` to create that file, or set environment variables so that the config is loaded.
-
+- #11023 - Added a new parameter to the `datahub put` CLI command: `--run-id`. This parameter is useful for associating a given write with an ingestion run. One use case is to mimic transformers when a transformer for the aspect being written does not exist.
+- #11051 - Ingestion reports will now trim the summary text to a maximum of 800k characters to avoid generating `dataHubExecutionRequestResult` aspects that are too large for GMS to handle.
 ## 0.13.3
 
 ### Breaking Changes
diff --git a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
index 2245e27ecedabf..a175870cd9fbea 100644
--- a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
+++ b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
@@ -31,6 +31,7 @@
 from datahub.utilities.logging_manager import get_log_buffer
 from datahub.utilities.urns.urn import Urn
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -43,6 +44,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
     _EXECUTOR_ID: str = "__datahub_cli_"
     _EXECUTION_REQUEST_SOURCE_TYPE: str = "CLI_INGESTION_SOURCE"
     _INGESTION_TASK_NAME: str = "CLI Ingestion"
+    _MAX_SUMMARY_SIZE: int = 800000
 
     @staticmethod
     def get_cur_time_in_ms() -> int:
@@ -209,7 +211,9 @@ def on_completion(
             status=status,
             startTimeMs=self.start_time_ms,
             durationMs=self.get_cur_time_in_ms() - self.start_time_ms,
-            report=summary,
+            # Truncate the summary so that the generated MCP does not exceed GMS's payload limit.
+            # Keeps the overall size of dataHubExecutionRequestResult under ~1MB by trimming the summary to 800,000 chars.
+            report=summary[-self._MAX_SUMMARY_SIZE:],
             structuredReport=structured_report,
         )
 
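
Note on the truncation above: a negative-start slice in Python keeps the *tail* of
the string, so the trim preserves the most recent log output, and reports shorter
than the limit pass through unchanged. Below is a minimal, self-contained sketch of
that behavior; the MAX_SUMMARY_SIZE constant and trim_report helper are hypothetical
names introduced here for illustration, mirroring _MAX_SUMMARY_SIZE and the slice in
the patch:

    # Hypothetical standalone sketch of the trimming behavior in this patch.
    MAX_SUMMARY_SIZE = 800_000  # chars kept; mirrors _MAX_SUMMARY_SIZE above

    def trim_report(summary: str) -> str:
        # A negative-start slice returns the last MAX_SUMMARY_SIZE characters;
        # if the string is shorter than that, it is returned unchanged.
        return summary[-MAX_SUMMARY_SIZE:]

    assert trim_report("x" * 1_000_000) == "x" * 800_000
    assert trim_report("short") == "short"

Trimming from the front (keeping the tail) is a deliberate choice here: when an
ingestion run fails, the end of the summary is where the errors and final status
land, so it is the most useful part to retain.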