
Commit

[feat] moved geocoding and kdm updating for ores to database, wrote function to delete nyiso withdrawn projects, include last_updated field for nyiso
deenasun committed Nov 17, 2024
1 parent b30b016 commit f34c561
Showing 9 changed files with 247 additions and 144 deletions.
Binary file modified api/webscraper/__pycache__/database_constants.cpython-312.pyc
Binary file modified api/webscraper/__pycache__/nyiso_scraper.cpython-312.pyc
Binary file modified api/webscraper/__pycache__/nyserda_scraper.cpython-312.pyc
281 changes: 177 additions & 104 deletions api/webscraper/database.py

Large diffs are not rendered by default.
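The database.py diff is collapsed here, but per the commit message it now owns the geocoding and ORES key-development-milestone updates that were removed from ores_scraper.py (see below). A hedged sketch of what that insert-time step could look like; `upsert_ores_project` and the `db` client are assumptions, while `geocode_lat_long` and `update_kdm` keep the signatures visible in the removed scraper code:

```python
# Hypothetical sketch only: database.py is not rendered in this diff.
# geocode_lat_long and update_kdm are the helpers from utils/scraper_utils.py;
# upsert_ores_project, db, and the "projects" table name are made up for illustration.
from utils.scraper_utils import geocode_lat_long, update_kdm


def upsert_ores_project(db, project: dict, milestone_title: str) -> None:
    # Geocode at insert time instead of in the scraper
    # (the scraper now leaves latitude/longitude as None).
    if project.get("town"):
        lat, long = geocode_lat_long(f"{project['town']}, NY")
        project["latitude"] = lat if lat else None
        project["longitude"] = long if long else None

    # Mark the relevant ORES milestone (e.g. "Application for permit to ORES"
    # or "Issuance of permit from ORES") as completed, as the scraper used to do.
    project["key_development_milestones"] = update_kdm(
        milestone_title,
        date=None,
        completed=True,
        kdm=project.get("key_development_milestones"),
    )

    db.upsert("projects", project)  # placeholder for the real database call
```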

10 changes: 0 additions & 10 deletions api/webscraper/database_constants.py
@@ -42,16 +42,6 @@
"date": None,
},
{"milestoneTitle": "Start of operations", "completed": False, "date": None},
{
"milestoneTitle": "Application for permit to ORES",
"completed": False,
"date": None,
},
{
"milestoneTitle": "Issuance of permit from ORES",
"completed": False,
"date": None,
},
]

project_fields = [
54 changes: 49 additions & 5 deletions api/webscraper/nyiso_scraper.py
@@ -30,6 +30,11 @@ def query_nyiso_excel():


def query_nyiso():
"""
Queries for all the projects in the NYISO sheet and filters
Outdated - does not modify behavior based on which sheet the project is from (Interconnection Queue, Cluster Projects, In Service)
returns: list of dictionaries representing the projects
"""
if nyiso_xlsx_href is None:
print('ERROR: "View the Interconnection Queue" link not found')
return
@@ -50,7 +55,7 @@ def query_nyiso():
continue
project_dict = {
"project_name": item.get("Project Name", None),
"project_status": "Proposed", # TODO: update this based on which sheet it's from
"project_status": "Proposed",
"renewable_energy_technology": renewable_energy_abbreviations[
item.get("Type/ Fuel")
], # map abbreviations into readable string
@@ -95,7 +100,9 @@ def filter_nyiso_list(project_list, sheet_name):
for item in project_list:
if sheet_name == "Interconnection Queue" and item.get("State") != "NY":
continue
elif sheet_name == "Cluster Projects" and item.get("State", None) != "New York":
elif sheet_name == "Cluster Projects" and not (
item.get("State", None) == "New York" or item.get("State", None) == "NY"
):
continue
elif sheet_name == "In Service" and item.get("State", None) != "NY":
continue
@@ -111,13 +118,22 @@ def filter_nyiso_list(project_list, sheet_name):
"developer": item.get("Developer Name", None),
"proposed_cod": item.get(
"Proposed COD", None
), # note: non-serializable into JSON --> can't directly write to file
), # NOTE: non-serializable into JSON --> can't directly write to file
"county": item.get("County", None),
"region": None, # missing
"zipcode": None, # missing
"latitude": None,
"longitude": None,
# 'data_through_date': item.get('Last Updated Date', None),
"last_updated": (
item.get("Last Updated Date", None) # NOTE: non-serializable into JSON
if (
sheet_name == "Interconnection Queue"
or sheet_name == "Cluster Projects"
)
else item.get(
"Last Update NaT", None
) # NOTE: the column header for the in-service sheet is called "Last Update NaT"
),
"key_development_milestones": None,
"project_image": None,
"interconnection_queue_number": item.get("Queue Pos.", None),
@@ -155,7 +171,7 @@ def filter_nyiso_cluster_sheet():
cluster_projects_df = clean_df_data(cluster_projects_df)
cluster_projects_list = cluster_projects_df.to_dict(orient="records")

filtered_list = filter_nyiso_list(cluster_projects_list, "Cluster Project")
filtered_list = filter_nyiso_list(cluster_projects_list, "Cluster Projects")
return filtered_list


@@ -184,9 +200,37 @@ def filter_nyiso_in_service_sheet():
return filtered_list


def filter_nyiso_withdrawn_sheets():
"""
Returns a list of dicts, each containing the key "project_name", for withdrawn projects.
"""
all_sheets = query_nyiso_excel()
sheet_names = list(all_sheets.keys())
withdrawn_key = sheet_names[2] # gets the sheet named "Withdrawn"
cluster_withdrawn_key = sheet_names[3] # gets the sheet named "Cluster Withdrawn"

withdrawn_df = all_sheets[withdrawn_key]
withdrawn_df = clean_df_data(withdrawn_df)
withdrawn_list = withdrawn_df.to_dict(orient="records")

cluster_withdrawn_df = all_sheets[cluster_withdrawn_key]
cluster_withdrawn_df = clean_df_data(cluster_withdrawn_df)
cluster_withdrawn_list = cluster_withdrawn_df.to_dict(orient="records")

withdrawn_list = withdrawn_list + cluster_withdrawn_list
filtered_list = [
{"project_name": item.get("Project Name", None)}
for item in withdrawn_list
if item.get("Project Name", None) is not None
]
return filtered_list


"""
For testing
"""
# write_nyiso_to_json()
# print(filter_nyiso_iq_sheet())
# print(filter_nyiso_in_service_sheet())
# print(filter_nyiso_cluster_sheet())
# print(filter_nyiso_withdrawn_sheets())
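
The commit message also mentions deleting withdrawn NYISO projects; that code lives in the collapsed database.py diff. A self-contained sketch of how the output of filter_nyiso_withdrawn_sheets() could drive such a delete, using an in-memory sqlite3 table as a stand-in for the real database (the table name and delete_nyiso_withdrawn_projects are assumptions):

```python
# Hypothetical sketch: consumes filter_nyiso_withdrawn_sheets() output to
# delete withdrawn projects. sqlite3 stands in for the real database; the
# "projects" table and function name are made up for illustration.
import sqlite3


def delete_nyiso_withdrawn_projects(conn, withdrawn):
    # withdrawn: list of {"project_name": ...} dicts, as returned by
    # filter_nyiso_withdrawn_sheets()
    names = [(item["project_name"],) for item in withdrawn]
    conn.executemany("DELETE FROM projects WHERE project_name = ?", names)
    conn.commit()


conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE projects (project_name TEXT, project_status TEXT)")
conn.execute("INSERT INTO projects VALUES ('Example Wind Farm', 'Proposed')")
delete_nyiso_withdrawn_projects(conn, [{"project_name": "Example Wind Farm"}])
print(conn.execute("SELECT COUNT(*) FROM projects").fetchone()[0])  # prints 0
```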
8 changes: 8 additions & 0 deletions api/webscraper/nyserda_scraper.py
@@ -115,6 +115,10 @@ def query_nyserda_solar(offset=0, limit=1000):

if size_in_mw is None or size_in_mw < 2:
continue
if (
item.get("project_id", None) is None
): # some projects have no project_id, so we skip them
continue

if check_status(item.get("project_status", None)) != "Cancelled":
project_dict = {
@@ -175,3 +179,7 @@ def write_small_to_json():
with open("api/webscraper/nyserda_small.json", "w") as file:
json.dump(project_list, file, indent=4)
file.write("\n")


test = query_nyserda_solar_repeat()
print(test[-10:])
33 changes: 9 additions & 24 deletions api/webscraper/ores_scraper.py
@@ -2,7 +2,7 @@
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from utils.scraper_utils import geocode_lat_long, update_kdm
from utils.scraper_utils import geocode_lat_long
from database_constants import initial_kdm_dict

# url = "https://dps.ny.gov/ores-permit-applications"
@@ -65,15 +65,13 @@ def filter_noi(data: list) -> list:
filtered_list = []
for row in data:
town, county = parse_for_location(row["Description"])
# TODO: move this parsing to the database.py file
lat, long = geocode_lat_long(f"{town}, NY")
project_dict = {
"permit_application_number": row.get("Permit Application Number", None),
"project_name": row.get("Project Name", None),
"town": town if town else None,
"county": county if county else None,
"latitude": lat if lat else None,
"longitude": long if long else None,
"latitude": None, # geocoding for lat/long is handled when inserting into database
"longitude": None,
"key_development_milestones": initial_kdm_dict,
}
filtered_list.append(project_dict)
@@ -90,22 +88,15 @@ def filter_under_review(data: list) -> list:
filtered_list = []
for row in data:
town, county = parse_for_location(row["Description"])
lat, long = geocode_lat_long(f"{town}, NY")
project_dict = {
"permit_application_number": row.get("Permit Application Number", None),
"project_name": row.get("Project Name", None),
"town": town if town else None,
"county": county if county else None,
"latitude": lat if lat else None,
"longitude": long if long else None,
"key_development_milestones": initial_kdm_dict,
"latitude": None, # geocoding for lat/long is handled when inserting into database
"longitude": None,
"key_development_milestones": initial_kdm_dict, # updating kdm for projects under review is handled in database.py
}
project_dict["key_development_milestones"] = update_kdm(
"Application for permit to ORES",
date=None,
completed=True,
kdm=project_dict.get("key_development_milestones"),
)
filtered_list.append(project_dict)
return filtered_list

@@ -126,16 +117,10 @@ def filter_permitted(data):
"project_name": row.get("Project Name", None),
"town": town if town else None,
"county": county if county else None,
"latitude": lat if lat else None,
"longitude": long if long else None,
"key_development_milestones": initial_kdm_dict,
"latitude": None, # geocoding for lat/long is handled when inserting into database
"longitude": None,
"key_development_milestones": initial_kdm_dict, # updating kdm for permitted projects is handled in database.py
}
project_dict["key_development_milestones"] = update_kdm(
"Issuance of permit from ORES",
date=None,
completed=True,
kdm=project_dict.get("key_development_milestones"),
)
filtered_list.append(project_dict)
return filtered_list

5 changes: 4 additions & 1 deletion api/webscraper/utils/scraper_utils.py
@@ -47,7 +47,10 @@ def create_update_object(existing_project, new_project):
if value is None and new_project.get(key, None) is not None:
update_object[key] = new_project[key]
# add field if existing project's value differs from new project's value
elif value != new_project[key] and new_project.get(key, None) is not None:
elif (
value != new_project.get(key, None)
and new_project.get(key, None) is not None
):
update_object[key] = new_project[key]
return update_object

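The guarded comparison above avoids a KeyError when a key exists on the stored project but is missing from the freshly scraped one. A minimal stand-in showing the intended behavior (the loop header and the example dicts are assumptions, not the repo's actual data):

```python
# Minimal stand-in to illustrate the .get() guard above; the surrounding loop
# is assumed to iterate over the existing project's items, and the dicts below
# are made-up examples.
def create_update_object(existing_project, new_project):
    update_object = {}
    for key, value in existing_project.items():
        # add field if existing project's value is missing
        if value is None and new_project.get(key, None) is not None:
            update_object[key] = new_project[key]
        # add field if existing project's value differs from new project's value
        elif (
            value != new_project.get(key, None)
            and new_project.get(key, None) is not None
        ):
            update_object[key] = new_project[key]
    return update_object


existing = {"project_name": "Example Solar", "county": "Albany", "zipcode": None}
scraped = {"project_name": "Example Solar", "zipcode": "12203"}  # no "county" key
# Before this commit, new_project[key] raised KeyError on "county"; now it is skipped.
print(create_update_object(existing, scraped))  # {'zipcode': '12203'}
```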
