-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 53db330
Showing
11 changed files
with
36,928 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
.DS_Store | ||
__pycache__ | ||
SPaRX/* | ||
.idea |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
Kyle Hazen - 20 March 2020 | ||
|
||
The SPARX data sets were provided in the sas7bdat file format. | ||
|
||
To ease exploratory analysis these files were imported into a | ||
PostgreSQL database named "pdsas". | ||
|
||
The code in the jupyter notebook requires a connection to the pdsas database, | ||
hosted at localhost. To set up the pdsas database on OSX you can do the following: | ||
|
||
To install postgres on mac use homebrew. | ||
``` | ||
brew install postgresql | ||
``` | ||
|
||
To start the postgres background service use | ||
``` | ||
brew services start postgresql | ||
``` | ||
|
||
|
||
Make a database called pdsas | ||
``` | ||
psql postgres | ||
>postgres=# CREATE DATABASE pdsas; | ||
``` | ||
|
||
Import 'pdsas.pgsql' into postgres with the following command. | ||
The postgres background service must be running. | ||
|
||
<user> is the owner of the pdsas database. | ||
``` | ||
psql -U <user> pdsas < pdsas.pgsql | ||
``` | ||
|
||
Edit the database.ini file with your username and password | ||
|
||
[postgresql] | ||
host=localhost | ||
database=pdsas | ||
user=<user> | ||
password=password | ||
|
||
Then open the jupyter notebook sparxv1.ipynb | ||
``` | ||
jupyter notebook sparxv1.ipynb | ||
``` | ||
|
||
|
||
The steps below document how the data was imported into postgresql. | ||
|
||
After initializing a postgresql database (pdsas), empty tables | ||
for each data set were created by running: | ||
```bash | ||
python maketables.py | ||
``` | ||
|
||
The tables were then populated with the bash script. | ||
```bash | ||
#!/usr/bin/env bash | ||
# Run in the SPARX Data Sets directory to populate tables. | ||
for f in *.sas7bdat; do sas2db --db postgresql+psycopg2://kyle:password@localhost:5432/pdsas $f; done; | ||
``` | ||
|
||
The ASCII encoding was then fixed in the database by running: | ||
```bash | ||
python converteascii.py | ||
``` | ||
|
||
The table names and column names were then converted to lowercase | ||
by running: | ||
```bash | ||
python lowercasetables.py | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
# !/usr/bin/python | ||
from db import connect | ||
|
||
def quote_ident(name):
    """Return *name* as a double-quoted PostgreSQL identifier.

    Embedded double quotes are doubled, so the result can be spliced into a
    statement regardless of the letter case or characters in the name.
    """
    return '"' + name.replace('"', '""') + '"'


def decode_hex_text(value):
    """Decode a stored value made of a backslash, an ``x`` and hex digits.

    Returns the decoded text, or ``None`` when *value* is ``None``, does not
    start with the backslash-x prefix, or cannot be decoded; the caller
    leaves such values untouched.

    Single quotes are replaced by ``-`` and double quotes by ``--``. The
    values are now passed as query parameters, so this is no longer needed
    for quoting; it is kept so reruns store the same text as before.
    """
    if value is None or not value.startswith('\\x'):
        return None
    try:
        # UnicodeDecodeError is a ValueError, so one clause covers both
        # malformed hex digits and bytes that do not decode.
        text = bytearray.fromhex(value[2:]).decode()
    except ValueError:
        return None
    return text.replace("'", "-").replace('"', "--")


if __name__ == '__main__':

    conn = connect()
    try:
        cur = conn.cursor()

        cur.execute(
            "SELECT table_name "
            "FROM INFORMATION_SCHEMA.TABLES "
            "WHERE table_schema = 'public' "
        )
        table_names = [row[0] for row in cur.fetchall()]

        for tname in table_names:
            print('Fixing ascii in table: ' + tname)
            table = 'public.' + quote_ident(tname)

            # Restrict to the public schema: the statements below always
            # address public.<table>, so same-named tables elsewhere must
            # not contribute columns.
            cur.execute(
                "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS"
                " WHERE data_type = 'text' AND table_schema = 'public'"
                " AND table_name = %s",
                (tname,),
            )
            column_names = [row[0] for row in cur.fetchall()]

            for cname in column_names:
                column = quote_ident(cname)
                cur.execute(f'SELECT index, {column} FROM {table}')

                update_sql = (
                    f'UPDATE {table} SET {column} = %s WHERE index = %s'
                )
                changed = False

                # Every row is checked on its own, so empty tables, NULLs
                # and already-plain values are simply skipped.
                for index, value in cur.fetchall():
                    text = decode_hex_text(value)
                    if text is None:
                        continue
                    print(update_sql, (text, index))
                    cur.execute(update_sql, (text, index))
                    changed = True

                if changed:
                    conn.commit()
    finally:
        conn.close()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
[postgresql] | ||
host=localhost | ||
database=pdsas | ||
user=kyle | ||
password=password |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# !/usr/bin/python | ||
import psycopg2 | ||
from configparser import ConfigParser | ||
|
||
def config(filename='database.ini', section='postgresql'):
    """Read one section of an INI file into a dict.

    Args:
        filename: Path of the INI file to parse.
        section: Name of the section whose options are returned.

    Returns:
        A dict mapping each option name in *section* to its string value.

    Raises:
        FileNotFoundError: If *filename* could not be read.
        Exception: If the file was read but has no section named *section*.
    """
    parser = ConfigParser()

    # ConfigParser.read() silently skips files it cannot open and returns
    # the list of files it parsed; an empty list means nothing was loaded.
    if not parser.read(filename):
        raise FileNotFoundError(f'Config file {filename} could not be read')

    if not parser.has_section(section):
        raise Exception(f'Section {section} not found in the {filename} file')

    return dict(parser.items(section))
|
||
|
||
def connect():
    """Open and return a psycopg2 connection.

    The keyword arguments passed to ``psycopg2.connect`` are whatever
    ``config()`` returns when called with its defaults.
    """
    return psycopg2.connect(**config())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/usr/bin/env bash
# Run in the SPARX Data Sets directory to populate tables.
# Passes every *.sas7bdat file in the current directory to sas2db, which
# writes to the pdsas database on localhost.
# NOTE(review): the connection URL embeds a user name and password.
for f in *.sas7bdat; do
    # With no matching files the glob stays as the literal pattern; skip it.
    [ -e "$f" ] || continue
    # Quote the name so files containing spaces are passed as one argument.
    sas2db --db postgresql+psycopg2://kyle:password@localhost:5432/pdsas "$f"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# !/usr/bin/python | ||
from db import connect | ||
|
||
def quote_ident(name):
    """Return *name* as a double-quoted PostgreSQL identifier.

    Embedded double quotes are doubled, so the result can be spliced into a
    statement regardless of the letter case or characters in the name.
    """
    return '"' + name.replace('"', '""') + '"'


def needs_lowering(name):
    """Return True when *name* differs from its lower-case form.

    ``not name.islower()`` is also true for names without any cased
    character (for example ``'_2020'``); renaming those onto themselves
    would be rejected by the server, so compare against ``lower()`` instead.
    """
    return name != name.lower()


if __name__ == '__main__':

    tables_sql = (
        "SELECT table_name FROM information_schema.tables "
        "WHERE table_schema = 'public';"
    )

    conn = connect()
    try:
        cur = conn.cursor()

        # Pass 1: rename every table whose name is not already lower case.
        cur.execute(tables_sql)
        for table in [row[0] for row in cur.fetchall()]:
            if not needs_lowering(table):
                continue
            sql = (
                f'ALTER TABLE public.{quote_ident(table)} '
                f'RENAME TO {quote_ident(table.lower())}'
            )
            print(sql)
            cur.execute(sql)
            conn.commit()

        # Pass 2: re-read the (now renamed) tables and lower their columns.
        cur.execute(tables_sql)
        tablenames = [row[0] for row in cur.fetchall()]

        for table in tablenames:
            # Limit the lookup to the public schema so a same-named table in
            # another schema cannot contribute columns.
            cur.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_schema = 'public' AND table_name = %s",
                (table,),
            )
            for col in [row[0] for row in cur.fetchall()]:
                if not needs_lowering(col):
                    continue
                sql = (
                    f'ALTER TABLE public.{quote_ident(table)} '
                    f'RENAME COLUMN {quote_ident(col)} '
                    f'TO {quote_ident(col.lower())}'
                )
                print(sql)
                cur.execute(sql)
                conn.commit()
    finally:
        # Close the connection even when a statement fails part-way.
        conn.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# !/usr/bin/python | ||
import os | ||
from db import connect | ||
|
||
def dataset_table_names(entries):
    """Return the table name for each ``.sas7bdat`` entry in *entries*.

    Entries with any other extension (for example ``.DS_Store``) are
    dropped; the table name is the file name without its extension. The
    input order is preserved.
    """
    return [
        os.path.splitext(entry)[0]
        for entry in entries
        if entry.endswith('.sas7bdat')
    ]


if __name__ == '__main__':

    conn = connect()
    try:
        cur = conn.cursor()

        datasets = os.listdir('SPaRX/SPARX Data Sets')

        for tname in dataset_table_names(datasets):

            # The CREATE TABLE below uses an unquoted name, which PostgreSQL
            # folds to lower case. Check the folded name as well as the exact
            # one so a second run skips tables this script already created.
            cur.execute(
                "SELECT EXISTS ("
                " SELECT FROM information_schema.tables"
                " WHERE table_schema = 'public'"
                " AND table_name IN (%s, %s)"
                ")",
                (tname, tname.lower()),
            )

            if cur.fetchone()[0]:
                continue

            # The table name comes from a local file name and is spliced in
            # unquoted, as before, so the created name is unchanged.
            cur.execute(f"CREATE TABLE {tname}()")
            conn.commit()
    finally:
        # Close the connection even when a statement fails part-way.
        conn.close()
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.