Skip to content

Commit

Permalink
Update scripts, instructions, and removed and changelog files
Browse files Browse the repository at this point in the history
  • Loading branch information
rlskoeser committed Jan 7, 2025
1 parent a674c97 commit 1bb464d
Show file tree
Hide file tree
Showing 10 changed files with 7,080 additions and 782 deletions.
9 changes: 8 additions & 1 deletion data-export-meta/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,11 @@ All scripts require pandas (`pip install pandas`).

- `readme_info.py` - use to generate dataset summary information for inclusion in plain-text readme (number of fields, number of rows, optional list of fields with descriptions); can also be used to generate a CSV data dictionary. Takes a path to the datapackage file; resource paths referenced in the datapackage must resolve.
- `member_changes.py` - for members in an old version not in the new version, creates a csv of changes with new ids for member ids that changed; requires pandas. Must be updated for new versions and should be added to changes from previous versions.
- `book_changes.py` - same as above, but for book ids
- `book_changes.py` - same as above, but for book ids

To generate the updated member and book changes:
- download the _removed.csv file from the previous release and rename it to remove the version number from the filename
- update the script with the path to the member or book csv for the new version
- update the old and new versions in the script
- run the script; it will append to the existing removed.csv file in the current directory
- rename the updated removed file to include the version number and copy to the appropriate folder
21 changes: 12 additions & 9 deletions data-export-meta/book_changes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# pip install pandas
# usage:
# python member_changes.py
# python book_changes.py

import csv

Expand All @@ -14,17 +14,20 @@
import requests


# published v1.1 books dataset
books_previous = "https://dataspace.princeton.edu/bitstream/88435/dsp016d570067j/2/SCoData_books_v1.1_2021-01.csv"
# local copy of v1.2 (not yet published)
books_new = "SCoData_books_v1.2_2022-01.csv"
books_csv = {
# published v1.1 books dataset
"1.1": "https://dataspace.princeton.edu/bitstream/88435/dsp016d570067j/2/SCoData_books_v1.1_2021-01.csv",
# local copy of v1.2
"1.2": "v1.2/SCoData_books_v1.2_2022-01.csv",
"2.0": "v2.0/SCoData_books_v2.0_2025.csv",
}

if __name__ == "__main__":
old_version = "1.1"
new_version = "1.2"
books_prev_df = pd.read_csv(books_previous)
old_version = "1.2"
new_version = "2.0"
books_prev_df = pd.read_csv(books_csv[old_version])
# members_v1_1_df = pd.read_csv(members_v1_1)
books_df = pd.read_csv(books_new)
books_df = pd.read_csv(books_csv[new_version])

# identify books in new version not in the previous
# FIXME: probably not useful because of merge/rename
Expand Down
32 changes: 18 additions & 14 deletions data-export-meta/member_changes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,31 +14,33 @@
import requests


# published v1 members dataset
members_v1 = "https://dataspace.princeton.edu/bitstream/88435/dsp0105741v63x/7/SCoData_members_v1_2020-07.csv"
# local copy of v1.1
# members_v1_1 = 'SCoData_members_v1.1_2021-01.csv'
# local copy of v1.2 (not yet published)
members_v1_2 = "SCoData_members_v1.2_2022-01.csv"
members_csv = {
# published v1 members dataset
"1.1": "https://dataspace.princeton.edu/bitstream/88435/dsp0105741v63x/7/SCoData_members_v1_2020-07.csv",
# local copy of v1.1
# members_v1_1 = 'SCoData_members_v1.1_2021-01.csv'
# local copy of v1.2 (not yet published)
"1.2": "v1.2/SCoData_members_v1.2_2022-01.csv",
"2.0": "v2.0/SCoData_members_v2.0_2025.csv",
}

if __name__ == "__main__":
old_version = "1.1"
new_version = "1.2"
members_v1_df = pd.read_csv(members_v1)
# members_v1_1_df = pd.read_csv(members_v1_1)
members_df = pd.read_csv(members_v1_2)
old_version = "1.2"
new_version = "2.0"
members_old_df = pd.read_csv(members_csv[old_version])
members_df = pd.read_csv(members_csv[new_version])

# identify members in new version not in the previous
# FIXME: not useful because of merge/rename
new_members = members_df[~members_df.uri.isin(members_v1_df.uri)]
new_members = members_df[~members_df.uri.isin(members_old_df.uri)]
print(
"%d new members in %s not included in %s"
% (len(new_members, old_version, new_version))
% (len(new_members), old_version, new_version)
)
new_uris = list(new_members.uri)

# identify members from previous version with uri not included in newer version
removed_members = members_v1_df[~members_v1_df.uri.isin(members_df.uri)]
removed_members = members_old_df[~members_old_df.uri.isin(members_df.uri)]
print(
"%d members from %s no longer included in %s"
% (len(removed_members), old_version, new_version)
Expand All @@ -51,7 +53,9 @@
# writer.writeheader()
defaults = {"in_version": old_version, "removed_version": new_version}
for member in removed_members.itertuples():
print(member.uri)
response = requests.get(member.uri, allow_redirects=False)
print(response)
info = defaults.copy()
info["uri"] = member.uri
if response.status_code == 301:
Expand Down
Loading

0 comments on commit 1bb464d

Please sign in to comment.