Skip to content

Commit

Permalink
fix: add where constraints to raw data export
Browse files Browse the repository at this point in the history
  • Loading branch information
alee committed Sep 6, 2024
1 parent e5eab87 commit 9d31e82
Showing 1 changed file with 39 additions and 17 deletions.
56 changes: 39 additions & 17 deletions django/curator/management/commands/export_raw_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@


class Command(BaseCommand):
help = "Export unaggregated raw data as CSV for a given time period"

help = """Export unaggregated raw data as CSV files for a given time period."""
directory = "/shared/data"

def add_arguments(self, parser):
Expand All @@ -32,39 +33,60 @@ def add_arguments(self, parser):
parser.add_argument(
"--selections",
"-s",
help="selected data tables to dump ",
help="selected data tables to dump: codebase, download, release, user",
default="codebase,download,release,user",
)

def _export(
    self,
    filename,
    table_name=None,
    select_statement=None,
    from_date=None,
    end_date=None,
):
    """
    Export data from a table or a select statement to a CSV file.

    Issues ``COPY (<query>) TO '<path>' WITH CSV HEADER`` through the Django
    database connection, targeting ``self.directory / filename``.

    Args:
        filename: name of the CSV file to create inside ``self.directory``.
        table_name: table to dump (``SELECT *`` ordered by ``id``) when no
            ``select_statement`` is given. Interpolated as an identifier, so
            it must come from trusted code, never from user input.
        select_statement: complete SELECT to export. It is used as-is, so
            ``from_date`` / ``end_date`` are NOT applied to it.
        from_date: optional inclusive lower bound on ``date_created``.
        end_date: optional inclusive upper bound on ``date_created``.

    Raises:
        ValueError: if neither ``table_name`` nor ``select_statement`` is
            given.
    """

    def quote_literal(value):
        # Double embedded single quotes so a value cannot terminate the SQL
        # string literal early.
        # NOTE(review): assumes standard_conforming_strings=on (backslashes
        # are not escape characters) -- confirm against the server config.
        return "'" + str(value).replace("'", "''") + "'"

    if not (table_name or select_statement):
        raise ValueError(
            "Must pass a valid table_name or select_statement parameter"
        )
    if select_statement is None:
        # Each bound is applied independently; previously a single bound was
        # silently ignored and the whole table was exported.
        conditions = []
        if from_date:
            conditions.append(f"date_created >= {quote_literal(from_date)}")
        if end_date:
            # NOTE(review): if date_created is a timestamp and end_date is a
            # bare date, rows later on the end day are excluded -- confirm
            # that this is the intended range.
            conditions.append(f"date_created <= {quote_literal(end_date)}")
        where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
        select_statement = f"""
            SELECT * FROM {table_name}
            {where_clause}
            ORDER BY id
        """
    destination_path = Path(self.directory) / filename
    with connection.cursor() as cursor:
        cursor.execute(
            f"COPY ({select_statement}) TO {quote_literal(destination_path)} WITH CSV HEADER"
        )

def export_codebases(self, **kwargs):
    """
    Dump the ``library_codebase`` table to ``codebases.csv``.

    Keyword arguments are forwarded unchanged to ``_export``.
    """
    csv_name, source_table = "codebases.csv", "library_codebase"
    self._export(csv_name, source_table, **kwargs)

def export_releases(self, **kwargs):
    """
    Dump the ``library_codebaserelease`` table to ``releases.csv``.

    Keyword arguments are forwarded unchanged to ``_export``.
    """
    csv_name, source_table = "releases.csv", "library_codebaserelease"
    self._export(csv_name, source_table, **kwargs)

def export_downloads(self, **kwargs):
    """
    Dump the ``library_codebasereleasedownload`` table to ``downloads.csv``.

    Keyword arguments are forwarded unchanged to ``_export``.
    """
    csv_name, source_table = "downloads.csv", "library_codebasereleasedownload"
    self._export(csv_name, source_table, **kwargs)

def export_users(self, from_date=None, end_date=None):
    """
    Export users joined with their member profiles to ``users.csv``.

    Builds a SELECT over ``auth_user`` inner-joined to ``core_memberprofile``
    and hands it to ``_export`` as ``select_statement``.

    Args:
        from_date: optional inclusive lower bound on ``u.date_joined``.
        end_date: optional inclusive upper bound on ``u.date_joined``.
    """
    # Each bound is applied independently; previously a single bound was
    # silently ignored. Embedded single quotes are doubled so a value cannot
    # break out of its SQL string literal.
    # NOTE(review): assumes standard_conforming_strings=on -- confirm.
    conditions = [
        "u.date_joined {} '{}'".format(operator, str(bound).replace("'", "''"))
        for operator, bound in ((">=", from_date), ("<=", end_date))
        if bound
    ]
    where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    join_user_member_profile_select = f"""
        SELECT
        u.id, u.last_login, u.is_superuser, u.username, u.first_name, u.last_name, u.email, u.date_joined, u.is_active,
        mp.affiliations, mp.bio, mp.degrees, mp.personal_url, mp.professional_url, mp.research_interests, mp.timezone,
        mp.industry
        FROM auth_user u INNER JOIN core_memberprofile mp ON u.id=mp.user_id
        {where_clause}
        ORDER BY u.id
    """
    self._export("users.csv", select_statement=join_user_member_profile_select)
def handle(self, *args, **options):
    """
    Export the selected raw data tables from the database to CSV files.

    Reads these keys from ``options``:
        from / to: optional date bounds passed to every exporter as
            ``from_date`` / ``end_date``.
        directory: output directory; created if missing and stored on
            ``self.directory``.
        selections: comma-separated subset of codebase, release, download,
            user. Whitespace around entries is ignored.
    """
    from_date_string = options.get("from")
    to_date_string = options.get("to")
    self.directory = options["directory"]
    # Strip whitespace so "codebase, user" selects both tables; a plain
    # split(",") left " user" unmatched and silently skipped it.
    selections = {
        name.strip() for name in options["selections"].split(",") if name.strip()
    }
    date_constraints = {"from_date": from_date_string, "end_date": to_date_string}
    os.makedirs(self.directory, exist_ok=True)
    # NOTE(review): world-writable, presumably so the database server
    # process can write the CSV files -- confirm this is required.
    os.chmod(self.directory, 0o777)
    # Ordered dispatch keeps the original export order.
    exporters = (
        ("codebase", self.export_codebases),
        ("release", self.export_releases),
        ("download", self.export_downloads),
        ("user", self.export_users),
    )
    for name, exporter in exporters:
        if name in selections:
            exporter(**date_constraints)

0 comments on commit 9d31e82

Please sign in to comment.