diff --git a/README.md b/README.md index 13c087e..410759e 100644 --- a/README.md +++ b/README.md @@ -113,13 +113,12 @@ building and tax period the data applies to. ### To import the CSV into postgres -*In progress* +You should have [docker4data](http://dockerfordata.com) installed and set up on +your system. -There are a few complicated dependencies here, including -[pgloader](http://pgloader.io), and a few external tables (PLUTO and the DHCR -stabilization building list history.) + ./reparse.sh - ./import.sh +This will directly parse the `data` folder into docker4data's postgres. ## Data Usage diff --git a/import.sh b/import.sh index 1c9a7c2..c2149d4 100755 --- a/import.sh +++ b/import.sh @@ -1,14 +1,9 @@ #!/bin/bash -e -# createdb stabilization 2>/dev/null || : - export PGPASSWORD=docker4data export PGUSER=postgres export PGHOST=localhost export PGPORT=54321 export PGDATABASE=postgres -# need to have pgloader installed -pgloader pgloader.load - psql -f cross-tab-rs-counts.sql diff --git a/parse.py b/parse.py index e7493d1..d57cf5a 100644 --- a/parse.py +++ b/parse.py @@ -30,6 +30,8 @@ "apts" ] +ROW_BUFFER = 10000 + BILL_PDF, STATEMENT_PDF, STATEMENT_HTML, NOPV_PDF, NOPV_HTML = ( 'Quarterly Property Tax Bill.pdf', 'Quarterly Statement of Account.pdf', 'Quarterly Statement of Account.html', 'Notice of Property Value.pdf', @@ -486,6 +488,7 @@ def main(root): #pylint: disable=too-many-locals,too-many-branches,too-many-stat """ writer = csv.DictWriter(sys.stdout, HEADERS) writer.writeheader() + rows_to_write = [] for path, _, files in os.walk(root): bbl_json = [] for filename in sorted(files): @@ -514,12 +517,10 @@ def main(root): #pylint: disable=too-many-locals,too-many-branches,too-many-stat file_data = handle.read() activity_through = parsedate(filename.split(' - ')[0]) for data in handler(file_data): - base = { - 'bbl': ''.join(bbl_array), - 'activityThrough': activity_through - } - base.update(data) - writer.writerow(base) + data['bbl'] = ''.join(bbl_array) + data['activityThrough'] = activity_through + #writer.writerow(base) + rows_to_write.append(data) bbl_json.append(data) except Exception as err: # pylint: disable=broad-except @@ -528,5 +529,11 @@ def main(root): #pylint: disable=too-many-locals,too-many-branches,too-many-stat with open(os.path.join(path, 'data.json'), 'w') as json_outfile: json.dump(bbl_json, json_outfile) + if len(rows_to_write) >= ROW_BUFFER: + writer.writerows(rows_to_write) + rows_to_write = [] + writer.writerows(rows_to_write) + + if __name__ == '__main__': main(sys.argv[1]) diff --git a/pgloader.load b/pgloader.load deleted file mode 100644 index b1b83f4..0000000 --- a/pgloader.load +++ /dev/null @@ -1,64 +0,0 @@ -LOAD CSV - FROM data/rawdata.csv - INTO postgresql://postgres:docker4data@localhost:54321/postgres?rawdata - WITH skip header = 2, - fields optionally enclosed by '"', - fields terminated by ',', - batch concurrency = 1, - batch rows = 1000, - batch size = 5MB - SET work_mem to '16MB', - maintenance_work_mem to '100 MB' - BEFORE LOAD DO - $$ drop table if exists rawdata; $$, - $$ create table if not exists rawdata ( - bbl bigint, - activityThrough DATE, - section TEXT, - key TEXT, - dueDate DATE, - activityDate DATE, - value TEXT, - meta TEXT, - apts TEXT - ); - $$; - -LOAD CSV - FROM data/rgb.csv - INTO postgresql://postgres:docker4data@localhost:54321/postgres?rgb - WITH skip header = 1, - fields optionally enclosed by '"', - fields terminated by ',', - batch concurrency = 1, - batch rows = 1000, - batch size = 5MB - SET work_mem to '16MB', - maintenance_work_mem to '100 MB' - BEFORE LOAD DO - $$ drop table if exists rgb; $$, - $$ create table if not exists rgb ( - source VARCHAR, - borough SMALLINT, - year INT, - add_421a INT, - add_421g INT, - add_420c INT, - add_j51 INT, - add_ML_buyout INT, - add_loft INT, - add_former_control REAL, - sub_high_rent_income INT, - sub_high_rent_vacancy INT, - sub_coop_condo_conversion INT, - sub_421a_expiration INT, - sub_j51_expiration INT, - sub_substantial_rehab INT, - sub_commercial_prof_conversion INT, - sub_other INT, - total_sub INT, - total_add REAL, - inflated VARCHAR, - net REAL - ); - $$; diff --git a/reparse.sh b/reparse.sh index a7eb76b..f9c5b1c 100755 --- a/reparse.sh +++ b/reparse.sh @@ -1,4 +1,50 @@ #!/bin/bash source .env/bin/activate -time python parse.py data/ >data/rawdata.csv 2>data/rawdata.log & + +export PGPASSWORD=docker4data +export PGUSER=postgres +export PGHOST=localhost +export PGPORT=54321 +export PGDATABASE=postgres + +psql -c 'drop table if exists rawdata cascade;' +psql -c 'create table rawdata ( + bbl bigint, + activityThrough DATE, + section TEXT, + key TEXT, + dueDate DATE, + activityDate DATE, + value TEXT, + meta TEXT, + apts TEXT + );' +psql -c 'drop table if exists rgb cascade;' +psql -c 'create table rgb ( + source VARCHAR, + borough SMALLINT, + year INT, + add_421a INT, + add_421g INT, + add_420c INT, + add_j51 INT, + add_ML_buyout INT, + add_loft INT, + add_former_control REAL, + sub_high_rent_income INT, + sub_high_rent_vacancy INT, + sub_coop_condo_conversion INT, + sub_421a_expiration INT, + sub_j51_expiration INT, + sub_substantial_rehab INT, + sub_commercial_prof_conversion INT, + sub_other INT, + total_sub INT, + total_add REAL, + inflated VARCHAR, + net REAL + );' +time cat data/rgb.csv | psql -c "COPY rgb FROM stdin WITH CSV HEADER NULL '' QUOTE'\"';" + +time python parse.py data/ 2>data/rawdata.log | psql -c "COPY rawdata FROM stdin WITH CSV HEADER NULL '' QUOTE '\"';"