Commit: test(ingest/bigquery): Add performance testing framework for bigquery usage (#7690)

- Creates metadata-ingestion/tests/performance directory
- Excludes metadata-ingestion/tests from docs generation
- Updates bigquery reporting around project state
Showing 10 changed files with 428 additions and 6 deletions.
@@ -0,0 +1,7 @@

# Performance Testing

This module provides a framework for performance testing our ingestion sources.

When running a performance test, make sure to output print statements and live logs:

```bash
pytest -s --log-cli-level=INFO -m performance tests/performance/<test_name>.py
```
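For orientation, a minimal performance test in this directory might look like the sketch below. This is an illustration, not a file from this commit: the test name, sizes, and timing logic are hypothetical, and the `performance` marker is assumed from the `-m performance` flag above.

```python
# Hypothetical sketch of a performance test; not part of this commit.
import time

import pytest

from tests.performance.data_generation import generate_data, generate_queries


@pytest.mark.performance  # selected by `pytest -m performance`
def test_query_generation_throughput() -> None:
    seed = generate_data(num_containers=5, num_tables=100, num_views=10)
    start = time.perf_counter()
    queries = list(
        generate_queries(seed, num_selects=10_000, num_operations=1_000, num_users=50)
    )
    elapsed = time.perf_counter() - start
    # Printed output is visible thanks to `pytest -s`.
    print(f"Generated {len(queries)} queries in {elapsed:.2f}s")
```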
@@ -0,0 +1 @@

```python
"""Tests for performance of ingestion, not run in CI."""
```
@@ -0,0 +1,99 @@

```python
import dataclasses
import random
import uuid
from collections import defaultdict
from typing import Dict, Iterable, List

from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
    AuditEvent,
    BigqueryTableIdentifier,
    BigQueryTableRef,
    QueryEvent,
    ReadEvent,
)
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from tests.performance.data_model import Query, Table

# https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/BigQueryAuditMetadata.TableDataRead.Reason
READ_REASONS = [
    "REASON_UNSPECIFIED",
    "JOB",
    "TABLEDATA_LIST_REQUEST",
    "GET_QUERY_RESULTS_REQUEST",
    "QUERY_REQUEST",
    "CREATE_READ_SESSION",
    "MATERIALIZED_VIEW_REFRESH",
]


def generate_events(
    queries: Iterable[Query],
    projects: List[str],
    table_to_project: Dict[str, str],
    config: BigQueryV2Config,
) -> Iterable[AuditEvent]:
    for query in queries:
        project = (  # Most queries are run in the project of the tables they access
            table_to_project[
                query.object_modified.name
                if query.object_modified
                else query.fields_accessed[0].table.name
            ]
            if random.random() >= 0.1
            else random.choice(projects)
        )
        job_name = str(uuid.uuid4())
        yield AuditEvent.create(
            QueryEvent(
                job_name=job_name,
                timestamp=query.timestamp,
                actor_email=query.actor,
                query=query.text,
                statementType=query.type,
                project_id=project,
                destinationTable=ref_from_table(query.object_modified, table_to_project)
                if query.object_modified
                else None,
                referencedTables=[
                    ref_from_table(field.table, table_to_project)
                    for field in query.fields_accessed
                    if not field.table.is_view()
                ],
                referencedViews=[
                    ref_from_table(field.table, table_to_project)
                    for field in query.fields_accessed
                    if field.table.is_view()
                ],
                payload=dataclasses.asdict(query)
                if config.debug_include_full_payloads
                else None,
            )
        )
        table_accesses = defaultdict(list)
        for field in query.fields_accessed:
            table_accesses[ref_from_table(field.table, table_to_project)].append(
                field.column
            )

        for ref, columns in table_accesses.items():
            yield AuditEvent.create(
                ReadEvent(
                    jobName=job_name,
                    timestamp=query.timestamp,
                    actor_email=query.actor,
                    resource=ref,
                    fieldsRead=columns,
                    readReason=random.choice(READ_REASONS),
                    payload=dataclasses.asdict(query)
                    if config.debug_include_full_payloads
                    else None,
                )
            )


def ref_from_table(table: Table, table_to_project: Dict[str, str]) -> BigQueryTableRef:
    return BigQueryTableRef(
        BigqueryTableIdentifier(
            table_to_project[table.name], table.container.name, table.name
        )
    )
```
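To see how these pieces fit together, here is a hedged usage sketch. It assumes the `data_generation` module introduced later in this commit, a default-constructed `BigQueryV2Config`, and that `generate_events` above is importable (its file path is not shown in this diff).

```python
# Illustrative wiring only; anything below not in this commit is an assumption.
import random

from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from tests.performance.data_generation import generate_data, generate_queries

seed = generate_data(num_containers=2, num_tables=20, num_views=5)
projects = ["project-0", "project-1"]
# generate_events expects every table/view name to map to some project.
table_to_project = {t.name: random.choice(projects) for t in seed.tables + seed.views}

queries = generate_queries(seed, num_selects=100, num_operations=20, num_users=10)
# generate_events is the function defined above; its import path is omitted
# because the diff does not show this file's name.
events = list(generate_events(queries, projects, table_to_project, BigQueryV2Config()))
print(f"Generated {len(events)} audit events")
```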
metadata-ingestion/tests/performance/data_generation.py (160 additions, 0 deletions)
@@ -0,0 +1,160 @@

```python
"""
Generates data for performance testing of warehouse sources.
In the future, we could try to create a more realistic dataset
by anonymizing and reduplicating a production datahub instance's data.
We could also get more human data by using Faker.
This is a work in progress, built piecemeal as needed.
"""
import random
import uuid
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Iterable, List, TypeVar

from tests.performance.data_model import (
    Container,
    FieldAccess,
    Query,
    StatementType,
    Table,
    View,
)

T = TypeVar("T")

OPERATION_TYPES: List[StatementType] = [
    "INSERT",
    "UPDATE",
    "DELETE",
    "CREATE",
    "ALTER",
    "DROP",
    "CUSTOM",
    "UNKNOWN",
]


@dataclass(frozen=True)
class NormalDistribution:
    mu: float
    sigma: float

    def sample(self) -> int:
        return int(random.gauss(mu=self.mu, sigma=self.sigma))

    def sample_with_floor(self, floor: int = 1) -> int:
        return max(int(random.gauss(mu=self.mu, sigma=self.sigma)), floor)


@dataclass
class SeedMetadata:
    containers: List[Container]
    tables: List[Table]
    views: List[View]
    start_time: datetime
    end_time: datetime


def generate_data(
    num_containers: int,
    num_tables: int,
    num_views: int,
    columns_per_table: NormalDistribution = NormalDistribution(5, 2),
    parents_per_view: NormalDistribution = NormalDistribution(2, 1),
    view_definition_length: NormalDistribution = NormalDistribution(150, 50),
    time_range: timedelta = timedelta(days=14),
) -> SeedMetadata:
    containers = [Container(f"container-{i}") for i in range(num_containers)]
    tables = [
        Table(
            f"table-{i}",
            container=random.choice(containers),
            columns=[
                f"column-{j}-{uuid.uuid4()}"
                for j in range(columns_per_table.sample_with_floor())
            ],
        )
        for i in range(num_tables)
    ]
    views = [
        View(
            f"view-{i}",
            container=random.choice(containers),
            columns=[
                f"column-{j}-{uuid.uuid4()}"
                for j in range(columns_per_table.sample_with_floor())
            ],
            definition=f"{uuid.uuid4()}-{'*' * view_definition_length.sample_with_floor(10)}",
            parents=random.sample(tables, parents_per_view.sample_with_floor()),
        )
        for i in range(num_views)
    ]

    now = datetime.now(tz=timezone.utc)
    return SeedMetadata(
        containers=containers,
        tables=tables,
        views=views,
        start_time=now - time_range,
        end_time=now,
    )


def generate_queries(
    seed_metadata: SeedMetadata,
    num_selects: int,
    num_operations: int,
    num_users: int,
    tables_per_select: NormalDistribution = NormalDistribution(3, 5),
    columns_per_select: NormalDistribution = NormalDistribution(10, 5),
    upstream_tables_per_operation: NormalDistribution = NormalDistribution(2, 2),
    query_length: NormalDistribution = NormalDistribution(100, 50),
) -> Iterable[Query]:
    all_tables = seed_metadata.tables + seed_metadata.views
    users = [f"user-{i}@xyz.com" for i in range(num_users)]
    for i in range(num_selects):  # Pure SELECT statements
        tables = _sample_list(all_tables, tables_per_select)
        all_columns = [
            FieldAccess(column, table) for table in tables for column in table.columns
        ]
        yield Query(
            text=f"{uuid.uuid4()}-{'*' * query_length.sample_with_floor(10)}",
            type="SELECT",
            actor=random.choice(users),
            timestamp=_random_time_between(
                seed_metadata.start_time, seed_metadata.end_time
            ),
            fields_accessed=_sample_list(all_columns, columns_per_select),
        )

    for i in range(num_operations):
        modified_table = random.choice(seed_metadata.tables)
        n_col = len(modified_table.columns)
        num_columns_modified = NormalDistribution(n_col, n_col / 2)
        upstream_tables = _sample_list(all_tables, upstream_tables_per_operation)

        all_columns = [
            FieldAccess(column, table)
            for table in upstream_tables
            for column in table.columns
        ]
        yield Query(
            text=f"{uuid.uuid4()}-{'*' * query_length.sample_with_floor(10)}",
            type=random.choice(OPERATION_TYPES),
            actor=random.choice(users),
            timestamp=_random_time_between(
                seed_metadata.start_time, seed_metadata.end_time
            ),
            fields_accessed=_sample_list(all_columns, num_columns_modified),
            object_modified=modified_table,
        )


def _sample_list(lst: List[T], dist: NormalDistribution) -> List[T]:
    return random.sample(lst, min(dist.sample_with_floor(), len(lst)))


def _random_time_between(start: datetime, end: datetime) -> datetime:
    return start + timedelta(seconds=(end - start).total_seconds() * random.random())
```
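A quick interactive check of the helpers above, under the default parameters. This is a sketch only, assuming the module path from this commit:

```python
from tests.performance.data_generation import NormalDistribution, generate_data

# sample_with_floor never returns less than its floor (default 1).
dist = NormalDistribution(mu=5, sigma=2)
assert dist.sample_with_floor() >= 1

# By default the seed metadata covers a 14-day window ending now.
seed = generate_data(num_containers=3, num_tables=10, num_views=2)
assert seed.start_time < seed.end_time
print(len(seed.containers), len(seed.tables), len(seed.views))  # 3 10 2
```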