Skip to content

Commit

Permalink
feat: enhance glue extractor (#306)
Browse files Browse the repository at this point in the history
* Add `is_view` (true when the Glue table's `TableType` is `VIRTUAL_VIEW`)

* Update Glue table description
For Athena tables, the table description is stored under the `comment` key of `Parameters`, so that value is used as a fallback when `Description` is missing or empty.

* Fix missing partition keys
Partition keys (`PartitionKeys`) should be part of the columns; they are appended after the `StorageDescriptor` columns.

* Add tests

* fix typo

* fix flake8
  • Loading branch information
ckljohn authored Jul 28, 2020
1 parent 4b7b147 commit faa795c
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 16 deletions.
12 changes: 7 additions & 5 deletions databuilder/extractor/glue_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,26 @@ def _get_extract_iter(self):
:return:
"""
for row in self._get_raw_extract_iter():
columns = []
columns, i = [], 0

for i in range(len(row['StorageDescriptor']['Columns'])):
column = row['StorageDescriptor']['Columns'][i]
for column in row['StorageDescriptor']['Columns'] \
+ row.get('PartitionKeys', []):
columns.append(ColumnMetadata(
column['Name'],
column['Comment'] if 'Comment' in column else None,
column['Type'],
i
))
i += 1

yield TableMetadata(
'glue',
self._cluster,
row['DatabaseName'],
row['Name'],
row['Description'] if 'Description' in row else None,
columns
row.get('Description') or row.get('Parameters', {}).get('comment'),
columns,
row.get('TableType') == 'VIRTUAL_VIEW',
)

def _get_raw_extract_iter(self):
Expand Down
71 changes: 60 additions & 11 deletions tests/unit/extractor/test_glue_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,15 @@ def test_extraction_with_single_result(self):
'Type': 'varchar'
}
]
}
},
'PartitionKeys': [
{
'Name': 'partition_key1',
'Type': 'string',
'Comment': 'description of partition_key1'
},
],
'TableType': 'EXTERNAL_TABLE',
}
]

Expand All @@ -85,7 +93,9 @@ def test_extraction_with_single_result(self):
ColumnMetadata('is_active', None, 'boolean', 2),
ColumnMetadata('source', 'description of source', 'varchar', 3),
ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
ColumnMetadata('ds', None, 'varchar', 5)])
ColumnMetadata('ds', None, 'varchar', 5),
ColumnMetadata('partition_key1', 'description of partition_key1', 'string', 6),
], False)
self.assertEqual(expected.__repr__(), actual.__repr__())
self.assertIsNone(extractor.extract())

Expand Down Expand Up @@ -128,7 +138,15 @@ def test_extraction_with_multiple_result(self):
'Type': 'varchar'
}
]
}
},
'PartitionKeys': [
{
'Name': 'partition_key1',
'Type': 'string',
'Comment': 'description of partition_key1'
},
],
'TableType': 'EXTERNAL_TABLE',
},
{
'Name': 'test_table2',
Expand All @@ -147,12 +165,12 @@ def test_extraction_with_multiple_result(self):
'Comment': 'description of col_name2'
}
]
}
},
'TableType': 'EXTERNAL_TABLE',
},
{
'Name': 'test_table3',
'DatabaseName': 'test_schema2',
'Description': 'test table 3',
'StorageDescriptor': {
'Columns': [
{
Expand All @@ -166,8 +184,30 @@ def test_extraction_with_multiple_result(self):
'Comment': 'description of col_name3'
}
]
}
}
},
'Parameters': {'comment': 'description of test table 3 from comment'},
'TableType': 'EXTERNAL_TABLE',
},
{
'Name': 'test_view1',
'DatabaseName': 'test_schema1',
'Description': 'test view 1',
'StorageDescriptor': {
'Columns': [
{
'Name': 'col_id3',
'Type': 'varchar',
'Comment': 'description of col_id3'
},
{
'Name': 'col_name3',
'Type': 'varchar',
'Comment': 'description of col_name3'
}
]
},
'TableType': 'VIRTUAL_VIEW',
},
]

extractor = GlueExtractor()
Expand All @@ -179,18 +219,27 @@ def test_extraction_with_multiple_result(self):
ColumnMetadata('is_active', None, 'boolean', 2),
ColumnMetadata('source', 'description of source', 'varchar', 3),
ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
ColumnMetadata('ds', None, 'varchar', 5)])
ColumnMetadata('ds', None, 'varchar', 5),
ColumnMetadata('partition_key1', 'description of partition_key1', 'string', 6),
], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

expected = TableMetadata('glue', 'gold', 'test_schema1', 'test_table2', 'test table 2',
[ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)])
ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

expected = TableMetadata('glue', 'gold', 'test_schema2', 'test_table3',
'description of test table 3 from comment',
[ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
ColumnMetadata('col_name3', 'description of col_name3',
'varchar', 1)], False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

expected = TableMetadata('glue', 'gold', 'test_schema2', 'test_table3', 'test table 3',
expected = TableMetadata('glue', 'gold', 'test_schema1', 'test_view1', 'test view 1',
[ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
ColumnMetadata('col_name3', 'description of col_name3',
'varchar', 1)])
'varchar', 1)], True)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

self.assertIsNone(extractor.extract())
Expand Down

0 comments on commit faa795c

Please sign in to comment.