init remote

k1sauce · Mar 20, 2020 · 53db330 · 53db330
commit 53db330
Show file tree

Hide file tree

Showing 11 changed files with 36,928 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.DS_Store
+__pycache__
+SPaRX/*
+.idea
diff --git a/README.md b/README.md
@@ -0,0 +1,75 @@
+Kyle Hazen - 20 March 2020
+
+The SPARX data sets were provided in the sas7bdat file format.
+
+To ease exploratory analysis these files were imported into a 
+PostgreSQL database named "pdsas". 
+
+The code in the jupyter notebook requires a connection to the pdsas database,
+hosted at localhost. To set up the pdsas database on OSX you can do the following:
+
+To install postgres on mac use homebrew.
+```
+brew install postgresql
+```
+
+To start the postgres background service use
+```
+brew services start postgresql
+```
+
+
+Make a database called pdsas
+```
+psql postgres
+>postgres=# CREATE DATABASE pdsas; 
+```
+
+import 'pdsas.pgsql' to postgres with the following command. 
+You should have the postgres background service running. 
+
+`<user>` is the owner of the pdsas database.
+```
+psql -U <user> pdsas < pdsas.pgsql
+```
+
+Edit the database.ini file with your username and password:
+
+```ini
+[postgresql]
+host=localhost
+database=pdsas
+user=<user>
+password=password
+```
+
+Then open the jupyter notebook sparxv1.ipynb
+```
+jupyter notebook sparxv1.ipynb
+```
+
+
+The steps below document how the data was imported into PostgreSQL.
+
+After initializing a PostgreSQL database (pdsas), tables
+for each data set were created by running:
+```bash
+python maketables.py
+```
+
+The tables were then populated with the bash script.
+```bash
+#!/usr/bin/env bash
+# Run in the SPARX Data Sets directory to populate tables.
+for f in *.sas7bdat; do sas2db --db postgresql+psycopg2://kyle:password@localhost:5432/pdsas $f; done;
+```
+
+And the ASCII encoding was fixed in the database by running.
+```bash
+python converteascii.py
+```
+
+The case of the table names and table columns names was converted to lowercase
+by running
+```bash
+python lowercasetable.py
+```
+
diff --git a/converteascii.py b/converteascii.py
@@ -0,0 +1,109 @@
+# !/usr/bin/python
+from db import connect
+
+if __name__ == '__main__':
+
+    conn = connect()
+    cur = conn.cursor()
+
+    sql = "SELECT table_name "
+    sql += "FROM INFORMATION_SCHEMA.TABLES "
+    sql += "WHERE table_schema = 'public' "
+
+    cur.execute(sql)
+    table_names = cur.fetchall()
+
+    for tname in table_names:
+
+        tname = tname[0]
+        print('Fixing ascii in table: ' + tname)
+
+        sql = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS"
+        sql += f" WHERE data_type = 'text' AND table_name = '{tname}'"
+
+
+        cur.execute(sql)
+        columnames = cur.fetchall()
+        for this_c_name in columnames:
+
+            cnlower = False
+
+            this_c_name = this_c_name[0]
+
+            if this_c_name.islower():
+
+                cnlower = True
+
+                sql = f'SELECT index, {this_c_name} FROM public."{tname}"'
+
+            else:
+
+                sql = f'SELECT index, "{this_c_name}" FROM public."{tname}"'
+
+
+            cur.execute(sql)
+
+            this_c = cur.fetchall()
+
+            # try to convert if the first row in this_c starts with \\x
+            try:
+                if this_c[0][1].startswith('\\x'):
+
+                    for row in this_c:
+
+                        index = row[0]
+                        ascii = bytearray.fromhex(row[1].split('x')[1]).decode()
+
+                        ascii = ascii.replace("'","-")
+                        ascii = ascii.replace('"', "--")
+
+
+                        sql = f'UPDATE public."{tname}" '
+
+                        if cnlower:
+                            sql += f"SET {this_c_name} "
+                        else:
+                            sql += f'SET "{this_c_name}" '
+
+                        sql += f"= '{ascii}' WHERE index = {index} "
+                        print(sql)
+                        cur.execute(sql)
+                        conn.commit()
+
+            except:
+
+                # if the first row in this_c is None check the rows and convert
+                if this_c[0][1] is None:
+
+                    for row in this_c:
+
+                        # continue if the value is None
+                        if row[1] is None:
+                            continue
+
+                        # else convert if starts with \\x
+                        else:
+                            if row[1].startswith('\\x'):
+
+                                index = row[0]
+                                ascii = bytearray.fromhex(row[1].split('x')[1]).decode()
+
+                                ascii = ascii.replace("'", "-")
+                                ascii = ascii.replace('"', "--")
+
+                                sql = f'UPDATE public."{tname}" '
+
+                                if cnlower:
+                                    sql += f"SET {this_c_name} "
+                                else:
+                                    sql += f'SET "{this_c_name}" '
+
+                                sql += f"= '{ascii}' WHERE index = {index} "
+
+                                print(sql)
+                                cur.execute(sql)
+                                conn.commit()
+
+                            else:
+                                continue
+
diff --git a/database.ini b/database.ini
@@ -0,0 +1,5 @@
+[postgresql]
+host=localhost
+database=pdsas
+user=kyle
+password=password
diff --git a/db.py b/db.py
@@ -0,0 +1,29 @@
+# !/usr/bin/python
+import psycopg2
+from configparser import ConfigParser
+
+def config(filename='database.ini', section='postgresql'):
+    # create a parser
+    parser = ConfigParser()
+    # read config file
+    parser.read(filename)
+
+    db = {}
+    if parser.has_section(section):
+        params = parser.items(section)
+        for param in params:
+            db[param[0]] = param[1]
+    else:
+        raise Exception(f'Section {section} not found in the {filename} file')
+
+    return db
+
+
+def connect():
+    """ Connect to the PostgreSQL database server """
+
+    # read connection parameters
+    params = config()
+
+    # return connection
+    return psycopg2.connect(**params)
diff --git a/filltables.sh b/filltables.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Run in the SPARX Data Sets directory to populate the tables.
+# Each *.sas7bdat file in the current directory is passed to sas2db, which
+# is pointed at the local pdsas database.
+# NOTE: the connection URL carries a hard-coded user and password; adjust it
+# to match your own database.ini.
+for f in *.sas7bdat; do
+    # When nothing matches, the glob stays literal; skip that non-file.
+    [ -e "$f" ] || continue
+    # Quote the path so file names containing spaces are passed as one argument.
+    sas2db --db postgresql+psycopg2://kyle:password@localhost:5432/pdsas "$f"
+done
diff --git a/lowercasetable.py b/lowercasetable.py
@@ -0,0 +1,49 @@
+# !/usr/bin/python
+from db import connect
+
+if __name__ == '__main__':
+
+    conn = connect()
+    cur = conn.cursor()
+
+    # get the table names
+    cur.execute("SELECT table_name  FROM information_schema.tables WHERE table_schema = 'public';")
+    tablenames = cur.fetchall()
+    tablenames = [x[0] for x in tablenames]
+
+    # make all the table names lower case if they exist
+    for table in tablenames:
+        if not table.islower():
+            sql = f'ALTER TABLE public."{table}" RENAME TO '
+            sql += table.lower()
+            print(sql)
+            cur.execute(sql)
+            conn.commit()
+        else:
+            continue
+
+    # get the table names
+    cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';")
+    tablenames = cur.fetchall()
+    tablenames = [x[0] for x in tablenames]
+
+
+    # make all the column names lower
+    for table in tablenames:
+
+        #get the column names
+        cur.execute(f"SELECT column_name FROM information_schema.columns WHERE table_name= '{table}'")
+        cnames = cur.fetchall()
+        cnames = [x[0] for x in cnames]
+
+        for col in cnames:
+            if not col.islower():
+                sql = f'ALTER TABLE {table} RENAME COLUMN "{col}" TO '
+                sql += col.lower()
+                print(sql)
+                cur.execute(sql)
+                conn.commit()
+            else:
+                continue
+
+    conn.close()
diff --git a/maketables.py b/maketables.py
@@ -0,0 +1,42 @@
+# !/usr/bin/python
+import os
+from db import connect
+
+if __name__ == '__main__':
+
+    conn = connect()
+    cur = conn.cursor()
+
+    datasets = os.listdir('SPaRX/SPARX Data Sets')
+
+    for ds in datasets:
+
+        tname = os.path.splitext(ds)[0]
+
+        # SQL string concat is very bad for security but OK
+
+        sql = " SELECT EXISTS ("
+        sql += " SELECT FROM information_schema.tables"
+        sql += " WHERE table_schema = 'public'"
+        sql += f" AND table_name = '{tname}'"
+        sql += ")"
+
+        cur.execute(sql)
+
+        if cur.fetchone()[0]:
+            continue
+        else:
+            sql = f"CREATE TABLE {tname}()"
+            cur.execute(sql)
+            conn.commit()
+
+    conn.close()
+
+
+
+
+
+
+
+
+