Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 134 add metadata validation #145

Merged
merged 6 commits into from
Jan 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,5 @@ ENV/

# Vim
.*.swp

sdv/data/
244 changes: 244 additions & 0 deletions examples/6. Metadata Validation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sdv import load_demo"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"metadata, tables = load_demo(metadata=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'tables': {'users': {'primary_key': 'user_id',\n",
" 'fields': {'user_id': {'type': 'id', 'subtype': 'integer'},\n",
" 'country': {'type': 'categorical'},\n",
" 'gender': {'type': 'categorical'},\n",
" 'age': {'type': 'numerical', 'subtype': 'integer'}}},\n",
" 'sessions': {'primary_key': 'session_id',\n",
" 'fields': {'session_id': {'type': 'id', 'subtype': 'integer'},\n",
" 'user_id': {'ref': {'field': 'user_id', 'table': 'users'},\n",
" 'type': 'id',\n",
" 'subtype': 'integer'},\n",
" 'device': {'type': 'categorical'},\n",
" 'os': {'type': 'categorical'}}},\n",
" 'transactions': {'primary_key': 'transaction_id',\n",
" 'fields': {'transaction_id': {'type': 'id', 'subtype': 'integer'},\n",
" 'session_id': {'ref': {'field': 'session_id', 'table': 'sessions'},\n",
" 'type': 'id',\n",
" 'subtype': 'integer'},\n",
" 'timestamp': {'type': 'datetime', 'format': '%Y-%m-%d'},\n",
" 'amount': {'type': 'numerical', 'subtype': 'float'},\n",
" 'approved': {'type': 'boolean'}}}}}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metadata.to_dict()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'users': user_id country gender age\n",
" 0 0 USA M 34\n",
" 1 1 UK F 23\n",
" 2 2 ES None 44\n",
" 3 3 UK M 22\n",
" 4 4 USA F 54\n",
" 5 5 DE M 57\n",
" 6 6 BG F 45\n",
" 7 7 ES None 41\n",
" 8 8 FR F 23\n",
" 9 9 UK None 30,\n",
" 'sessions': session_id user_id device os\n",
" 0 0 0 mobile android\n",
" 1 1 1 tablet ios\n",
" 2 2 1 tablet android\n",
" 3 3 2 mobile android\n",
" 4 4 4 mobile ios\n",
" 5 5 5 mobile android\n",
" 6 6 6 mobile ios\n",
" 7 7 6 tablet ios\n",
" 8 8 6 mobile ios\n",
" 9 9 8 tablet ios,\n",
" 'transactions': transaction_id session_id timestamp amount approved\n",
" 0 0 0 2019-01-01 12:34:32 100.0 True\n",
" 1 1 0 2019-01-01 12:42:21 55.3 True\n",
" 2 2 1 2019-01-07 17:23:11 79.5 True\n",
" 3 3 3 2019-01-10 11:08:57 112.1 False\n",
" 4 4 5 2019-01-10 21:54:08 110.0 False\n",
" 5 5 5 2019-01-11 11:21:20 76.3 True\n",
" 6 6 7 2019-01-22 14:44:10 89.5 True\n",
" 7 7 8 2019-01-23 10:14:09 132.1 False\n",
" 8 8 9 2019-01-27 16:09:17 68.0 True\n",
" 9 9 9 2019-01-29 12:10:48 99.9 True}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tables"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"metadata.validate()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"metadata.validate(tables)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"metadata._metadata['tables']['users']['primary_key'] = 'country'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "MetadataError",
"evalue": "id field `user_id` is neither a primary or a foreign key",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mMetadataError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-f10112f21d52>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmetadata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Projects/MIT/SDV/sdv/metadata.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, tables)\u001b[0m\n\u001b[1;32m 612\u001b[0m \u001b[0mtable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 614\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_table\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtable_meta\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 615\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_circular_relationships\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_parents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/MIT/SDV/sdv/metadata.py\u001b[0m in \u001b[0;36m_validate_table\u001b[0;34m(self, table_name, table_meta, table_data)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[0mmatch\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmetadata\u001b[0m \u001b[0mdescription\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 526\u001b[0m \"\"\"\n\u001b[0;32m--> 527\u001b[0;31m \u001b[0mdtypes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_dtypes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;31m# Primary key field exists and its type is 'id'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/MIT/SDV/sdv/metadata.py\u001b[0m in \u001b[0;36mget_dtypes\u001b[0;34m(self, table_name, ids)\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mtable_meta\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'primary_key'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfield\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'ref'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 365\u001b[0m raise MetadataError(\n\u001b[0;32m--> 366\u001b[0;31m 'id field `{}` is neither a primary or a foreign key'.format(name))\n\u001b[0m\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mids\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfield_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'id'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mMetadataError\u001b[0m: id field `user_id` is neither a primary or a foreign key"
]
}
],
"source": [
"metadata.validate()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"metadata._metadata['tables']['users']['primary_key'] = 'user_id'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"metadata.validate()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"metadata._metadata['tables']['users']['fields']['gender']['type'] = 'numerical'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"metadata.validate()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"ename": "MetadataError",
"evalue": "Invalid values found in column gender of table users: could not convert string to float: 'M'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mMetadataError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-13-df0efeaaefea>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmetadata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalidate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtables\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Projects/MIT/SDV/sdv/metadata.py\u001b[0m in \u001b[0;36mvalidate\u001b[0;34m(self, tables)\u001b[0m\n\u001b[1;32m 612\u001b[0m \u001b[0mtable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 614\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_table\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtable_meta\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 615\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_circular_relationships\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 616\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_parents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/MIT/SDV/sdv/metadata.py\u001b[0m in \u001b[0;36m_validate_table\u001b[0;34m(self, table_name, table_meta, table_data)\u001b[0m\n\u001b[1;32m 546\u001b[0m message = 'Invalid values found in column {} of table {}: {}'.format(\n\u001b[1;32m 547\u001b[0m column, table_name, ve)\n\u001b[0;32m--> 548\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mMetadataError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 549\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 550\u001b[0m \u001b[0;31m# assert all dtypes are in data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mMetadataError\u001b[0m: Invalid values found in column gender of table users: could not convert string to float: 'M'"
]
}
],
"source": [
"metadata.validate(tables)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading