diff --git a/api/webscraper/__pycache__/database_constants.cpython-312.pyc b/api/webscraper/__pycache__/database_constants.cpython-312.pyc index f63fbf0..4239b59 100644 Binary files a/api/webscraper/__pycache__/database_constants.cpython-312.pyc and b/api/webscraper/__pycache__/database_constants.cpython-312.pyc differ diff --git a/api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc b/api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc index dcee55d..92d2556 100644 Binary files a/api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc and b/api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc differ diff --git a/api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc b/api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc index 74a9064..0b6f860 100644 Binary files a/api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc and b/api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc differ diff --git a/api/webscraper/database.py b/api/webscraper/database.py index c2b6572..291018b 100644 --- a/api/webscraper/database.py +++ b/api/webscraper/database.py @@ -1,16 +1,14 @@ -import json import os -import pandas as pd from datetime import datetime from supabase import create_client, Client from geocodio import GeocodioClient from nyserda_scraper import query_nyserda_large, query_nyserda_solar_repeat from nyiso_scraper import ( - query_nyiso, filter_nyiso_iq_sheet, filter_nyiso_cluster_sheet, filter_nyiso_in_service_sheet, + filter_nyiso_withdrawn_sheets, ) from ores_scraper import query_ores_noi, query_ores_under_review, query_ores_permitted from utils.scraper_utils import ( @@ -25,16 +23,19 @@ url: str = os.environ.get("NEXT_PUBLIC_SUPABASE_URL") key: str = os.environ.get("NEXT_PUBLIC_SUPABASE_ANON_KEY") supabase: Client = create_client(url, key) -supabase_table: str = "Projects_duplicate" +supabase_table: str = ( + "Projects_user_testing" # TODO: modify based on which table in supabase we want to edit +) geocode_api: str = 
os.environ.get("NEXT_PUBLIC_GEOCODIO_API_KEY") geocodio = GeocodioClient(geocode_api) - # NOTE: Supabase date objects follow the format YYYY-MM-DD -def nyserda_large_to_database(): + + +def nyserda_large_to_database() -> None: database = [] - database.extend(query_nyserda_large()[:10]) + database.extend(query_nyserda_large()) for project in database: if project.get("proposed_cod", None) is not None: ymd = datetime.strptime(project.get("proposed_cod"), "%Y").strftime( @@ -94,6 +95,14 @@ def nyserda_large_to_database(): # delete this field before pushing to supabase if "nyserda_contract_date" in project: del project["nyserda_contract_date"] + + # if project status is operational, also update "Start of operations" key development milestone + update_object["key_development_milestones"] = update_kdm( + "Start of operations", + date=project.get("proposed_cod"), + completed=True, + kdm=update_object["key_development_milestones"], + ) try: response = ( supabase.table(supabase_table) @@ -152,7 +161,7 @@ def nyserda_large_to_database(): print(exception) -def nyserda_solar_to_database(): +def nyserda_solar_to_database() -> None: database = [] database.extend(query_nyserda_solar_repeat()) for project in database: @@ -260,110 +269,159 @@ def nyserda_solar_to_database(): print(exception) -def nyiso_to_database(): - database = [] - database.extend(filter_nyiso_iq_sheet()) - database.extend(filter_nyiso_cluster_sheet()) - database.extend(filter_nyiso_in_service_sheet()) - for project in database: - if project.get("proposed_cod", None) is not None: - try: - ymd = datetime.strptime(project.get("proposed_cod"), "%m-%Y").strftime( - "%Y-%m-%d" - ) - except Exception as exception: - ymd = datetime.strptime(project.get("proposed_cod"), "%m/%Y").strftime( - "%Y-%m-%d" +def nyiso_to_database() -> None: + # helper function to handle different actions based on which sheet the data is from + def nyiso_to_database_helper(projects, sheet_name): + for project in projects: + # turns the 
proposed_cod Datetime objects into datetime strings + if project.get("proposed_cod", None) is not None: + try: + ymd = datetime.strptime( + project.get("proposed_cod"), "%m-%Y" + ).strftime("%Y-%m-%d") + except Exception as exception: + ymd = datetime.strptime( + project.get("proposed_cod"), "%m/%Y" + ).strftime("%Y-%m-%d") + except Exception as exception: + print(exception) + project["proposed_cod"] = ymd + + # turns last_updated Timestamp objects into datetime strings + if project.get("last_updated", None) is not None: + try: + ymd = project.get("last_updated").strftime("%Y-%m-%d") + except Exception as exception: + print(exception) + project["last_updated"] = ymd + + existing_data = ( + supabase.table(supabase_table) + .select("*") + .eq( + "interconnection_queue_number", + project["interconnection_queue_number"], ) - except Exception as exception: - print(exception) - project["proposed_cod"] = ymd + .execute() + ) + if len(existing_data.data) > 0: + existing_project = existing_data.data[0] + + # This helper function creates a dict of only fields that the existing project is missing + # but the NYISO data has + update_object = create_update_object(existing_project, project) + if ( + existing_project["key_development_milestones"] is None + or len(existing_project["key_development_milestones"]) < 0 + ): + update_object["key_development_milestones"] = initial_kdm_dict + else: + update_object["key_development_milestones"] = existing_project[ + "key_development_milestones" + ] + + # updating key development milestones + if project.get("date_of_ir", None) is not None: + entry_date = project.get("date_of_ir") + entry_date = entry_date.strftime("%Y-%m-%d") + update_object["key_development_milestones"] = update_kdm( + milestoneTitle="Entry to NYISO Queue", + completed=True, + date=entry_date, + kdm=update_object["key_development_milestones"], + ) + if project.get("ia_tender_date", None) is not None: + ia_date = project.get("ia_tender_date") + current_date = datetime.now()
+ completed = ia_date < current_date + ia_date = ia_date.strftime("%Y-%m-%d") + update_object["key_development_milestones"] = update_kdm( + milestoneTitle="Tendering of an Interconnection Agreement (IA)", + completed=completed, + date=ia_date, + kdm=update_object["key_development_milestones"], + ) + if sheet_name == "In Service": + update_object["key_development_milestones"] = update_kdm( + milestoneTitle="Start of operations", + date=None, + completed=True, + kdm=update_object["key_development_milestones"], + ) + try: + response = ( + supabase.table(supabase_table) + .update(update_object) + .eq( + "interconnection_queue_number", + project["interconnection_queue_number"], + ) + .execute() + ) + print("UPDATE", response, "\n") + except Exception as exception: + print(exception) + else: + # appending key development milestones + project["key_development_milestones"] = initial_kdm_dict + if project.get("date_of_ir", None) is not None: + entry_date = project.get("date_of_ir") + entry_date = entry_date.strftime("%Y-%m-%d") + project["key_development_milestones"] = update_kdm( + milestoneTitle="Entry to NYISO Queue", + completed=True, + date=entry_date, + kdm=project["key_development_milestones"], + ) + if project.get("ia_tender_date", None) is not None: + ia_date = project.get("ia_tender_date") + current_date = datetime.now() + completed = ia_date < current_date + ia_date = ia_date.strftime("%Y-%m-%d") + project["key_development_milestones"] = update_kdm( + milestoneTitle="Tendering of an Interconnection Agreement (IA)", + completed=completed, + date=ia_date, + kdm=project["key_development_milestones"], + ) + if "date_of_ir" in project: + del project["date_of_ir"] + if "ia_tender_date" in project: + del project["ia_tender_date"] + try: + response = supabase.table(supabase_table).insert(project).execute() + print("INSERT", response, "\n") + except Exception as exception: + print(exception) + + # call helper function for each sheet with the corresponding sheet name + 
nyiso_to_database_helper(filter_nyiso_iq_sheet(), "Interconnection Queue") + nyiso_to_database_helper(filter_nyiso_cluster_sheet(), "Cluster Projects") + nyiso_to_database_helper(filter_nyiso_in_service_sheet(), "In Service") + + +def check_withdrawn_nyiso_in_database() -> None: + """ + This function uses projects queried from the Withdrawn and Cluster Projects-Withdrawn sheet of NYISO + to delete any currently stored projects in the database that have been withdrawn. + """ + withdrawn_projects = filter_nyiso_withdrawn_sheets() + for project in withdrawn_projects: existing_data = ( supabase.table(supabase_table) .select("*") - .eq("interconnection_queue_number", project["interconnection_queue_number"]) + .eq("project_name", project["project_name"]) .execute() ) if len(existing_data.data) > 0: - existing_project = existing_data.data[0] - - # This helper function creates a dict of only fields that the existing project is missing - # but the NYISO data has - update_object = create_update_object(existing_project, project) - if ( - existing_project["key_development_milestones"] is None - or len(existing_project["key_development_milestones"]) < 0 - ): - update_object["key_development_milestones"] = initial_kdm_dict - else: - update_object["key_development_milestones"] = existing_project[ - "key_development_milestones" - ] - - # updating key development milestones - if project.get("date_of_ir", None) is not None: - entry_date = project.get("date_of_ir") - entry_date = entry_date.strftime("%Y-%m-%d") - update_object["key_development_milestones"] = update_kdm( - milestoneTitle="Entry to NYISO Queue", - completed=True, - date=entry_date, - kdm=update_object["key_development_milestones"], - ) - if project.get("ia_tender_date", None) is not None: - ia_date = project.get("ia_tender_date") - current_date = datetime.now() - completed = ia_date < current_date - ia_date = ia_date.strftime("%Y-%m-%d") - update_object["key_development_milestones"] = update_kdm( -
milestoneTitle="Tendering of an Interconnection Agreement (IA)", - completed=completed, - date=ia_date, - kdm=update_object["key_development_milestones"], - ) try: response = ( supabase.table(supabase_table) - .update(update_object) - .eq( - "interconnection_queue_number", - project["interconnection_queue_number"], - ) + .delete() + .eq("project_name", project["project_name"]) .execute() ) - print("UPDATE", response, "\n") - except Exception as exception: - print(exception) - else: - # appending key development milestones - project["key_development_milestones"] = initial_kdm_dict - if project.get("date_of_ir", None) is not None: - entry_date = project.get("date_of_ir") - entry_date = entry_date.strftime("%Y-%m-%d") - project["key_development_milestones"] = update_kdm( - milestoneTitle="Entry to NYISO Queue", - completed=True, - date=entry_date, - kdm=project["key_development_milestones"], - ) - if project.get("ia_tender_date", None) is not None: - ia_date = project.get("ia_tender_date") - current_date = datetime.now() - completed = ia_date < current_date - ia_date = ia_date.strftime("%Y-%m-%d") - project["key_development_milestones"] = update_kdm( - milestoneTitle="Tendering of an Interconnection Agreement (IA)", - completed=completed, - date=ia_date, - kdm=project["key_development_milestones"], - ) - if "date_of_ir" in project: - del project["date_of_ir"] - if "ia_tender_date" in project: - del project["ia_tender_date"] - try: - response = supabase.table(supabase_table).insert(project).execute() - print("INSERT", response, "\n") + print("DELETE", response, "\n") except Exception as exception: print(exception) @@ -395,6 +453,10 @@ def ores_noi_to_database(): except Exception as exception: print(exception) else: + if project.get("town", None) is not None: + lat, long = geocode_lat_long(f"{project['town']}, NY") + project["latitude"] = lat + project["longitude"] = long try: response = supabase.table(supabase_table).insert(project).execute() print("INSERT", response, 
"\n") @@ -402,7 +464,7 @@ def ores_noi_to_database(): print(exception) -def ores_under_review_to_database(): +def ores_under_review_to_database() -> None: database = [] database.extend(query_ores_under_review()) for project in database: @@ -447,6 +509,11 @@ def ores_under_review_to_database(): except Exception as exception: print(exception) else: + if project.get("town", None) is not None: + lat, long = geocode_lat_long(f"{project['town']}, NY") + project["latitude"] = lat + project["longitude"] = long + project["key_development_milestones"] = update_kdm( milestoneTitle="Application for permit to ORES", completed=True, @@ -460,7 +527,7 @@ def ores_under_review_to_database(): print(exception) -def ores_permitted_to_database(): +def ores_permitted_to_database() -> None: database = [] database.extend(query_ores_permitted()) for project in database: @@ -505,6 +572,11 @@ def ores_permitted_to_database(): except Exception as exception: print(exception) else: + if project.get("town", None) is not None: + lat, long = geocode_lat_long(f"{project['town']}, NY") + project["latitude"] = lat + project["longitude"] = long + project["key_development_milestones"] = update_kdm( milestoneTitle="Issuance of permit from ORES", completed=True, @@ -521,9 +593,10 @@ def ores_permitted_to_database(): """ For testing """ -nyserda_large_to_database() -# nyserda_solar_to_database() +# nyserda_large_to_database() +nyserda_solar_to_database() # nyiso_to_database() # ores_noi_to_database() # ores_under_review_to_database() # ores_permitted_to_database() +# check_withdrawn_nyiso_in_database() diff --git a/api/webscraper/database_constants.py b/api/webscraper/database_constants.py index c94bf56..08a2f7d 100644 --- a/api/webscraper/database_constants.py +++ b/api/webscraper/database_constants.py @@ -42,16 +42,6 @@ "date": None, }, {"milestoneTitle": "Start of operations", "completed": False, "date": None}, - { - "milestoneTitle": "Application for permit to ORES", - "completed": False, - "date": 
None, - }, - { - "milestoneTitle": "Issuance of permit from ORES", - "completed": False, - "date": None, - }, ] project_fields = [ diff --git a/api/webscraper/nyiso_scraper.py b/api/webscraper/nyiso_scraper.py index e4fbe63..17aaec2 100644 --- a/api/webscraper/nyiso_scraper.py +++ b/api/webscraper/nyiso_scraper.py @@ -30,6 +30,11 @@ def query_nyiso_excel(): def query_nyiso(): + """ + Queries for all the projects in the NYISO sheet and filters + Outdated - does not modify behavior based on which sheet the project is from (Interconnection Queue, Cluster Projects, In Service) + returns: list of dictionaries representing the projects + """ if nyiso_xlsx_href is None: print('ERROR: "View the Interconnection Queue" link not found') return @@ -50,7 +55,7 @@ def query_nyiso(): continue project_dict = { "project_name": item.get("Project Name", None), - "project_status": "Proposed", # TODO: update this based on which sheet it's from + "project_status": "Proposed", "renewable_energy_technology": renewable_energy_abbreviations[ item.get("Type/ Fuel") ], # map abbreviations into readable string @@ -95,7 +100,9 @@ def filter_nyiso_list(project_list, sheet_name): for item in project_list: if sheet_name == "Interconnection Queue" and item.get("State") != "NY": continue - elif sheet_name == "Cluster Projects" and item.get("State", None) != "New York": + elif sheet_name == "Cluster Projects" and not ( + item.get("State", None) == "New York" or item.get("State", None) == "NY" + ): continue elif sheet_name == "In Service" and item.get("State", None) != "NY": continue @@ -111,13 +118,22 @@ def filter_nyiso_list(project_list, sheet_name): "developer": item.get("Developer Name", None), "proposed_cod": item.get( "Proposed COD", None - ), # note: non-serializable into JSON --> can't directly write to file + ), # NOTE: non-serializable into JSON --> can't directly write to file "county": item.get("County", None), "region": None, # missing "zipcode": None, # missing "latitude": None, 
"longitude": None, - # 'data_through_date': item.get('Last Updated Date', None), + "last_updated": ( + item.get("Last Updated Date", None) # NOTE: non-serializable into JSON + if ( + sheet_name == "Interconnection Queue" + or sheet_name == "Cluster Projects" + ) + else item.get( + "Last Update NaT", None + ) # NOTE: the column header for the in-service sheet is called "Last Update NaT" + ), "key_development_milestones": None, "project_image": None, "interconnection_queue_number": item.get("Queue Pos.", None), @@ -155,7 +171,7 @@ def filter_nyiso_cluster_sheet(): cluster_projects_df = clean_df_data(cluster_projects_df) cluster_projects_list = cluster_projects_df.to_dict(orient="records") - filtered_list = filter_nyiso_list(cluster_projects_list, "Cluster Project") + filtered_list = filter_nyiso_list(cluster_projects_list, "Cluster Projects") return filtered_list @@ -184,9 +200,37 @@ def filter_nyiso_in_service_sheet(): return filtered_list +def filter_nyiso_withdrawn_sheets(): + """ + Returns a list of objects containing the key: "project_name" of withdrawn projects + """ + all_sheets = query_nyiso_excel() + sheet_names = list(all_sheets.keys()) + withdrawn_key = sheet_names[2] # gets the sheet named "Withdrawn" + cluster_withdrawn_key = sheet_names[3] # gets the sheet named "Cluster Withdrawn" + + withdrawn_df = all_sheets[withdrawn_key] + withdrawn_df = clean_df_data(withdrawn_df) + withdrawn_list = withdrawn_df.to_dict(orient="records") + + cluster_withdrawn_df = all_sheets[cluster_withdrawn_key] + cluster_withdrawn_df = clean_df_data(cluster_withdrawn_df) + cluster_withdrawn_list = cluster_withdrawn_df.to_dict(orient="records") + + withdrawn_list = withdrawn_list + cluster_withdrawn_list + filtered_list = [ + {"project_name": item.get("Project Name", None)} + for item in withdrawn_list + if item.get("Project Name", None) is not None + ] + return filtered_list + + """ For testing """ # write_nyiso_to_json() +# print(filter_nyiso_iq_sheet()) # 
print(filter_nyiso_in_service_sheet()) # print(filter_nyiso_cluster_sheet()) +# print(filter_nyiso_withdrawn_sheets()) diff --git a/api/webscraper/nyserda_scraper.py b/api/webscraper/nyserda_scraper.py index 0043499..ff2f9f1 100644 --- a/api/webscraper/nyserda_scraper.py +++ b/api/webscraper/nyserda_scraper.py @@ -115,6 +115,10 @@ def query_nyserda_solar(offset=0, limit=1000): if size_in_mw is None or size_in_mw < 2: continue + if ( + item.get("project_id", None) is None + ): # some projects have no project_id, so we skip them + continue if check_status(item.get("project_status", None)) != "Cancelled": project_dict = { @@ -175,3 +179,7 @@ def write_small_to_json(): with open("api/webscraper/nyserda_small.json", "w") as file: json.dump(project_list, file, indent=4) file.write("\n") + + +test = query_nyserda_solar_repeat() +print(test[-10:]) diff --git a/api/webscraper/ores_scraper.py b/api/webscraper/ores_scraper.py index 36114c2..cbb78dc 100644 --- a/api/webscraper/ores_scraper.py +++ b/api/webscraper/ores_scraper.py @@ -2,7 +2,7 @@ from bs4 import BeautifulSoup import pandas as pd from io import StringIO -from utils.scraper_utils import geocode_lat_long, update_kdm +from utils.scraper_utils import geocode_lat_long from database_constants import initial_kdm_dict # url = "https://dps.ny.gov/ores-permit-applications" @@ -65,15 +65,13 @@ def filter_noi(data: list) -> list: filtered_list = [] for row in data: town, county = parse_for_location(row["Description"]) - # TODO: move this parsing to the database.py file - lat, long = geocode_lat_long(f"{town}, NY") project_dict = { "permit_application_number": row.get("Permit Application Number", None), "project_name": row.get("Project Name", None), "town": town if town else None, "county": county if county else None, - "latitude": lat if lat else None, - "longitude": long if long else None, + "latitude": None, # geocoding for lat/long is handled when inserting into database + "longitude": None, "key_development_milestones": 
initial_kdm_dict, } filtered_list.append(project_dict) @@ -90,22 +88,15 @@ def filter_under_review(data: list) -> list: filtered_list = [] for row in data: town, county = parse_for_location(row["Description"]) - lat, long = geocode_lat_long(f"{town}, NY") project_dict = { "permit_application_number": row.get("Permit Application Number", None), "project_name": row.get("Project Name", None), "town": town if town else None, "county": county if county else None, - "latitude": lat if lat else None, - "longitude": long if long else None, - "key_development_milestones": initial_kdm_dict, + "latitude": None, # geocoding for lat/long is handled when inserting into database + "longitude": None, + "key_development_milestones": initial_kdm_dict, # updating kdm for projects under review is handled in database.py } - project_dict["key_development_milestones"] = update_kdm( - "Application for permit to ORES", - date=None, - completed=True, - kdm=project_dict.get("key_development_milestones"), - ) filtered_list.append(project_dict) return filtered_list @@ -126,16 +117,10 @@ def filter_permitted(data): "project_name": row.get("Project Name", None), "town": town if town else None, "county": county if county else None, - "latitude": lat if lat else None, - "longitude": long if long else None, - "key_development_milestones": initial_kdm_dict, + "latitude": None, # geocoding for lat/long is handled when inserting into database + "longitude": None, + "key_development_milestones": initial_kdm_dict, # updating kdm for permitted projects is handled in database.py } - project_dict["key_development_milestones"] = update_kdm( - "Issuance of permit from ORES", - date=None, - completed=True, - kdm=project_dict.get("key_development_milestones"), - ) filtered_list.append(project_dict) return filtered_list diff --git a/api/webscraper/utils/scraper_utils.py b/api/webscraper/utils/scraper_utils.py index 256c5c4..21e615c 100644 --- a/api/webscraper/utils/scraper_utils.py +++ 
b/api/webscraper/utils/scraper_utils.py @@ -47,7 +47,10 @@ def create_update_object(existing_project, new_project): if value is None and new_project.get(key, None) is not None: update_object[key] = new_project[key] # add field if existing project's value differs from new project's value - elif value != new_project[key] and new_project.get(key, None) is not None: + elif ( + value != new_project.get(key, None) + and new_project.get(key, None) is not None + ): update_object[key] = new_project[key] return update_object