Skip to content

Commit

Permalink
init remote
Browse files Browse the repository at this point in the history
  • Loading branch information
k1sauce committed Mar 20, 2020
0 parents commit 53db330
Show file tree
Hide file tree
Showing 11 changed files with 36,928 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.DS_Store
__pycache__
SPaRX/*
.idea
75 changes: 75 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
Kyle Hazen - 20 March 2020

The SPARX data sets were provided in the sas7bdat file format.

To ease exploratory analysis these files were imported into a
PostgreSQL database named "pdsas".

The code in the jupyter notebook requires a connection to the pdsas database,
hosted at localhost. To set up the pdsas database on OSX you can do the following:

To install postgres on mac use homebrew.
```
brew install postgresql
```

To start the postgres background service use
```
brew services start postgresql
```


Make a database called pdsas
```
psql postgres
>postgres=# CREATE DATABASE pdsas;
```

import 'pdsas.pgsql' to postgres with the following command.
The postgres background service must be running.

<user> is the owner of the pdsas database.
```
psql -U <user> pdsas < pdsas.pgsql
```

Edit the database.ini file with your username and password

[postgresql]
host=localhost
database=pdsas
user=<user>
password=password

Then open the jupyter notebook sparxv1.ipynb
```
jupyter notebook sparxv1.ipynb
```


The steps below document how the data was imported into PostgreSQL.

After initializing a PostgreSQL database (pdsas), an empty table
was created for each data set by running:
```bash
python maketables.py
```

The tables were then populated with the bash script.
```bash
#!/usr/bin/env bash
# Run in the "SPARX Data Sets" directory to populate tables.
for f in *.sas7bdat; do sas2db --db postgresql+psycopg2://kyle:password@localhost:5432/pdsas $f; done;
```

And the ASCII encoding was fixed in the database by running.
```bash
python converteascii.py
```

The case of the table names and table columns names was converted to lowercase
by running
```bash
python lowercasetable.py
```

109 changes: 109 additions & 0 deletions converteascii.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# !/usr/bin/python
from db import connect

# Marker that prefixes values stored as hex-escaped bytes instead of text.
HEX_PREFIX = '\\x'


def decode_hex_text(value):
    """Return the text encoded by a hex-escaped value, or None if not escaped.

    Args:
        value: A text column value; may be None or ordinary text.

    Returns:
        The UTF-8 decoded text when ``value`` starts with ``HEX_PREFIX``;
        None when ``value`` is None or does not carry the prefix.

    Raises:
        ValueError: If the payload after the prefix is not valid hex or does
            not decode as UTF-8 (UnicodeDecodeError is a ValueError).
    """
    if value is None or not value.startswith(HEX_PREFIX):
        return None
    return bytes.fromhex(value[len(HEX_PREFIX):]).decode()


def fix_column(cur, table, column):
    """Rewrite every hex-escaped value of one text column as plain text.

    Each row is checked on its own, so columns that mix NULLs, plain text
    and hex-escaped values are handled, and empty tables are a no-op.

    Args:
        cur: An open psycopg2 cursor.
        table: Name of a table in the ``public`` schema.
        column: Name of a text column of ``table``.

    Returns:
        The number of rows that were updated.
    """
    # Imported locally: only this function needs SQL composition, and the
    # script's top-level imports are left untouched.
    from psycopg2 import sql

    # Identifiers are always quoted so mixed-case names resolve correctly;
    # quoting an all-lowercase name is equivalent to leaving it bare.
    names = {'tbl': sql.Identifier(table), 'col': sql.Identifier(column)}
    select = sql.SQL('SELECT index, {col} FROM public.{tbl}').format(**names)
    update = sql.SQL(
        'UPDATE public.{tbl} SET {col} = %s WHERE index = %s'
    ).format(**names)

    cur.execute(select)
    updated = 0
    for row_index, value in cur.fetchall():
        try:
            text = decode_hex_text(value)
        except ValueError as err:
            # Leave undecodable values untouched rather than aborting the run.
            print(f'Skipping {table}.{column} index {row_index}: {err}')
            continue
        if text is None:
            continue
        # The value is passed as a bound parameter, so quotes in the decoded
        # text are stored as-is instead of being replaced to keep the SQL valid.
        cur.execute(update, (text, row_index))
        updated += 1
    return updated


def fix_ascii(conn):
    """Decode hex-escaped values in every text column of the public schema.

    Args:
        conn: An open psycopg2 connection. Changes are committed after each
            column; the caller is responsible for closing the connection.
    """
    with conn.cursor() as cur:
        cur.execute(
            "SELECT table_name FROM information_schema.tables "
            "WHERE table_schema = 'public'"
        )
        tables = [row[0] for row in cur.fetchall()]

        for table in tables:
            print('Fixing ascii in table: ' + table)

            # Restrict to the public schema so a same-named table in another
            # schema cannot contribute columns.
            cur.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE data_type = 'text' "
                "AND table_schema = 'public' AND table_name = %s",
                (table,),
            )
            columns = [row[0] for row in cur.fetchall()]

            for column in columns:
                updated = fix_column(cur, table, column)
                conn.commit()
                if updated:
                    print(f'  {column}: converted {updated} value(s)')


if __name__ == '__main__':

    conn = connect()
    try:
        fix_ascii(conn)
    finally:
        conn.close()

5 changes: 5 additions & 0 deletions database.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[postgresql]
host=localhost
database=pdsas
user=kyle
password=password
29 changes: 29 additions & 0 deletions db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# !/usr/bin/python
import psycopg2
from configparser import ConfigParser

def config(filename='database.ini', section='postgresql'):
    """Load one section of an INI file as a dict of connection parameters.

    Args:
        filename: Path of the INI file to read.
        section: Name of the section whose options are returned.

    Returns:
        A dict mapping each option name in ``section`` to its string value.

    Raises:
        FileNotFoundError: If ``filename`` could not be read.
        Exception: If the file was read but has no ``section`` section.
    """
    parser = ConfigParser()

    # ConfigParser.read() skips unreadable files without raising and returns
    # the list of files it parsed, so an empty result means nothing was read.
    if not parser.read(filename):
        raise FileNotFoundError(f'Config file {filename} not found or unreadable')

    if not parser.has_section(section):
        raise Exception(f'Section {section} not found in the {filename} file')

    return dict(parser.items(section))


def connect():
    """Open and return a psycopg2 connection built from config()'s settings."""
    # config() supplies the keyword arguments (read from the INI file).
    return psycopg2.connect(**config())
3 changes: 3 additions & 0 deletions filltables.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Populate the pdsas tables from the SAS data sets.
# Run from inside the "SPARX Data Sets" directory.
#
# The connection URL can be overridden through the SAS2DB_URL environment
# variable; when unset, the original hard-coded URL is used.
db_url="${SAS2DB_URL:-postgresql+psycopg2://kyle:password@localhost:5432/pdsas}"

for f in *.sas7bdat; do
    # With no matching files the glob stays literal; skip it rather than
    # handing a non-existent file name to sas2db.
    [ -e "$f" ] || continue
    # Quote the name so a file containing spaces is passed as one argument.
    sas2db --db "$db_url" "$f"
done
49 changes: 49 additions & 0 deletions lowercasetable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# !/usr/bin/python
from db import connect

def needs_lowercasing(name):
    """Return True when ``name`` differs from its lowercase form.

    Unlike ``not name.islower()``, this is False for names that contain no
    cased characters (e.g. ``'123'``), which have nothing to rename.
    """
    return name != name.lower()


def public_table_names(cur):
    """Return the names of all tables in the ``public`` schema."""
    cur.execute(
        "SELECT table_name FROM information_schema.tables "
        "WHERE table_schema = 'public';"
    )
    return [row[0] for row in cur.fetchall()]


def lowercase_identifiers(conn):
    """Rename every public table and column to its lowercase name.

    Tables are renamed first, then the table list is read again so the
    column pass sees the new names. Each rename is committed on its own.

    Args:
        conn: An open psycopg2 connection; the caller closes it.
    """
    # Imported locally: only this function needs SQL composition, and the
    # script's top-level imports are left untouched.
    from psycopg2 import sql

    # Both the old and the new identifier are quoted so names with mixed
    # case or special characters are passed through unchanged.
    rename_table = sql.SQL('ALTER TABLE public.{old} RENAME TO {new}')
    rename_column = sql.SQL(
        'ALTER TABLE public.{tbl} RENAME COLUMN {old} TO {new}'
    )

    with conn.cursor() as cur:
        # make all the table names lower case
        for table in public_table_names(cur):
            if not needs_lowercasing(table):
                continue
            statement = rename_table.format(
                old=sql.Identifier(table),
                new=sql.Identifier(table.lower()),
            )
            print(statement.as_string(conn))
            cur.execute(statement)
            conn.commit()

        # make all the column names lower case
        for table in public_table_names(cur):
            # Restrict to the public schema so a same-named table in another
            # schema cannot contribute columns.
            cur.execute(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_schema = 'public' AND table_name = %s",
                (table,),
            )
            columns = [row[0] for row in cur.fetchall()]

            for column in columns:
                if not needs_lowercasing(column):
                    continue
                statement = rename_column.format(
                    tbl=sql.Identifier(table),
                    old=sql.Identifier(column),
                    new=sql.Identifier(column.lower()),
                )
                print(statement.as_string(conn))
                cur.execute(statement)
                conn.commit()


if __name__ == '__main__':

    conn = connect()
    try:
        lowercase_identifiers(conn)
    finally:
        conn.close()
42 changes: 42 additions & 0 deletions maketables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# !/usr/bin/python
import os
from db import connect

# Directory holding the SAS data sets, relative to the working directory.
DATASET_DIR = 'SPaRX/SPARX Data Sets'
# Only files with this extension (any case) are treated as data sets.
DATASET_SUFFIX = '.sas7bdat'


def table_names(filenames):
    """Map directory entries to the table names that should exist.

    Entries that are not ``.sas7bdat`` files (for example ``.DS_Store``) are
    ignored. Names are lowercased because the original unquoted
    ``CREATE TABLE`` let PostgreSQL fold them to lowercase; using the folded
    name for the existence check as well makes re-runs skip existing tables.

    Args:
        filenames: Iterable of file names (no directory part).

    Returns:
        A list of lowercase table names, in input order.
    """
    names = []
    for filename in filenames:
        stem, suffix = os.path.splitext(filename)
        if suffix.lower() == DATASET_SUFFIX:
            names.append(stem.lower())
    return names


def make_tables(conn, names):
    """Create an empty table in the public schema for each missing name.

    Args:
        conn: An open psycopg2 connection; the caller closes it.
        names: Table names to create if they do not exist yet.
    """
    # Imported locally: only this function needs SQL composition, and the
    # script's top-level imports are left untouched.
    from psycopg2 import sql

    with conn.cursor() as cur:
        for name in names:
            # The name is a bound parameter here, not spliced into the SQL.
            cur.execute(
                "SELECT EXISTS ("
                " SELECT FROM information_schema.tables"
                " WHERE table_schema = 'public'"
                " AND table_name = %s"
                ")",
                (name,),
            )
            if cur.fetchone()[0]:
                continue

            # Quote the identifier so names that are not plain SQL
            # identifiers (spaces, dashes, leading digits) are accepted.
            cur.execute(
                sql.SQL('CREATE TABLE public.{}()').format(sql.Identifier(name))
            )
            conn.commit()


if __name__ == '__main__':

    conn = connect()
    try:
        make_tables(conn, table_names(os.listdir(DATASET_DIR)))
    finally:
        conn.close()









Loading

0 comments on commit 53db330

Please sign in to comment.