Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New functionality: --replace-headers #45

Merged
merged 5 commits into from
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/pds4_create_xml_index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -195,10 +195,13 @@ Miscellaneous
- ``--verbose``: Display detailed information during the file scraping process that may
be useful for debugging.

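- ``--rename-headers``: Change the headers of the output file from their XPath/simplified
XPath counterparts to user-defined values via a given text file. Each entry within the
text file must be on its own line and have the format
``<old_column_name>,<new_column_name>``. Blank lines and lines beginning with ``#`` are
ignored; any other line that does not contain exactly two comma-separated values causes
the program to exit with an error.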

- ``--config-file``: Specify one or more YAML-style configuration files for further
customization of the extraction process. See the section below for details.


Configuration Files
-------------------

Expand Down
115 changes: 91 additions & 24 deletions pds4indextools/pds4_create_xml_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,49 @@ def split_xpath_prefix_and_num(s):
return xpath_map


def replace_columns(filepath, df_or_xpaths):
    """
    Replace column names in a DataFrame or list of XPaths using a mapping file.

    The mapping file is a text file in which each entry has the form
    ``<old_column_name>,<new_column_name>``. Blank lines and lines whose first
    non-whitespace character is ``#`` are ignored. Whitespace around either name is
    stripped. If the same old name appears more than once, the last entry wins.

    The type of the return value follows the type of df_or_xpaths: a
    pandas.DataFrame input (index file generation) yields a new, renamed
    pandas.DataFrame, and any other iterable (headers file generation) yields a new
    list of header values. Names that do not appear in the mapping are left
    unchanged. The input object is not modified.

    Parameters:
        filepath (str): Path to the txt file containing old and new column names.
        df_or_xpaths (pandas.DataFrame or list): the DataFrame or list containing the
            original columns of the index/headers file.

    Returns:
        pandas.DataFrame or list: Updated DataFrame or updated XPaths list.

    Raises:
        SystemExit: With exit status 1, after printing the offending line, if a
            non-blank, non-comment line does not contain exactly two comma-separated
            values.
    """
    mapping = {}

    # 'utf-8-sig' reads plain UTF-8 and also discards a leading byte-order mark.
    # Without it, a BOM written by some editors stays attached to the first token,
    # so the first old name never matches (or a leading '#' comment is not
    # recognised as a comment).
    with open(filepath, 'r', encoding='utf-8-sig') as mapping_fp:
        for raw_line in mapping_fp:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            parts = line.split(',')
            if len(parts) != 2:
                print(f"Invalid line in mapping file: {line}")
                sys.exit(1)

            old_name, new_name = (part.strip() for part in parts)
            mapping[old_name] = new_name

    if isinstance(df_or_xpaths, pd.DataFrame):
        return df_or_xpaths.rename(columns=mapping)

    return [mapping.get(xpath, xpath) for xpath in df_or_xpaths]


def split_into_elements(xpath):
"""
Extract elements from an XPath in the order they appear.
Expand Down Expand Up @@ -714,7 +757,8 @@ def update_nillable_elements_from_xsd_file(xsd_file, nillable_elements_info):
nillable_elements_info[name] = 'External or built-in type'


def write_results_to_csv(results_list, new_columns, args, output_csv_path):
def write_results_to_csv(results_list, new_columns, elements_to_scrape, args,
output_csv_path):
"""
Write results from a list of dictionaries to a CSV file.

Expand Down Expand Up @@ -762,15 +806,15 @@ def pad_column_values_and_headers(df):
df = pd.DataFrame(rows)

if new_columns is not None:
new_columns_sorted = sorted(new_columns.items(), key=lambda x: x[1][0])
for col_name in elements_to_scrape:
if col_name in new_columns:
index, col_values = new_columns[col_name]

for col_name, (index, col_values) in new_columns_sorted:
# If the column already exists, remove it temporarily
if col_name in df.columns:
# Remove column if it already exists
df = df.drop(columns=[col_name])

# Insert the column at the desired index
df.insert(index, col_name, col_values)
# Insert in the correct position
df.insert(index, col_name, col_values)

if (
df.map(lambda x: isinstance(x, str) and ('"' in x))
Expand Down Expand Up @@ -799,6 +843,9 @@ def pad_column_values_and_headers(df):
print(bad_sort)
sys.exit(1)

if args.rename_headers:
df = replace_columns(args.rename_headers, df)

if args.fixed_width:
padded_df = pad_column_values_and_headers(df)

Expand Down Expand Up @@ -1301,6 +1348,11 @@ def main(cmd_line=None):
'file using additional --config-file arguments, in which case '
'each subsequent configuration file augments and overrides '
'the previous files.')
misc.add_argument('--rename-headers', type=str,
metavar='RENAME_COLUMNS_FILEPATH',
help='Optional text file mapping XPaths to new header names. Each '
'line should contain an original XPath on the left and its '
'replacement on the right. One entry per line.')

args = parser.parse_args(cmd_line)

Expand Down Expand Up @@ -1369,9 +1421,10 @@ def main(cmd_line=None):
and args.limit_xpaths_file
and elements_to_scrape is not None
):
for x in elements_to_scrape:
if x in valid_add_extra_file_info:
extra_file_info_ind[x] = elements_to_scrape.index(x)
extra_file_info_ind = {
x: i for i, x in enumerate(elements_to_scrape)
if x in valid_add_extra_file_info
}

# For each file in label_files, load in schema files and namespaces for reference.
# Traverse the label file and scrape the desired contents. Place these contents
Expand Down Expand Up @@ -1467,11 +1520,12 @@ def main(cmd_line=None):
all_results.append(label_results)

for label_results in all_results:
if extra_file_info_ind != {}:
if extra_file_info_ind:
new_columns = {}
for key in extra_file_info_ind.keys():
values = [d[key] for d in all_results]
new_columns[key] = (extra_file_info_ind[key], values)
for key in elements_to_scrape:
if key in extra_file_info_ind:
values = [d[key] for d in all_results]
new_columns[key] = (extra_file_info_ind[key], values)
else:
new_columns = None

Expand Down Expand Up @@ -1499,7 +1553,8 @@ def main(cmd_line=None):
original_headers[key] = key.split('/')[-1]

if output_csv_path:
clean_header_mapping = write_results_to_csv(all_results, new_columns, args,
clean_header_mapping = write_results_to_csv(all_results, new_columns,
elements_to_scrape, args,
output_csv_path)

# To instead receive a list of available information available within a label or set
Expand All @@ -1517,22 +1572,34 @@ def main(cmd_line=None):
xpaths.append(xpath)

if new_columns is not None:
# Sort new elements by index
new_elements_sorted = sorted(new_columns.items(), key=lambda x: x[1][0])

# Insert new elements into xpaths
for name, (index, value) in new_elements_sorted:
# Remove the value if it exists
if name in xpaths:
xpaths.remove(name)
# Insert at the desired index
xpaths.insert(index, name)
# Create a new list to store the reordered elements
reordered_xpaths = [None] * (len(xpaths) + len(new_columns))

# Fill in known positions from new_columns
for col_name, (index, _) in new_columns.items():
reordered_xpaths[index] = col_name # Place at correct index

# Fill in the rest of the elements while shifting to the left
xpath_idx = 0 # Index for iterating over
for i in range(len(reordered_xpaths)):
if reordered_xpaths[i] is None: # If this slot isn't occupied
while xpath_idx < len(xpaths) and xpaths[xpath_idx] in new_columns:
xpath_idx += 1 # Skip over `filename` and `filepath`
if xpath_idx < len(xpaths):
reordered_xpaths[i] = xpaths[xpath_idx] # Place original element
xpath_idx += 1 # Move to next

# Remove any remaining `None` values (in case of overshoot)
xpaths = [x for x in reordered_xpaths if x is not None]

# The file is now written and placed in a given location. If cleaned header
# field names are requested, they are processed here before being written in.
with open(output_txt_path, 'w') as output_fp:
if args.simplify_xpaths:
xpaths = simplify_xpaths(xpaths)
if args.rename_headers:
xpaths = replace_columns(args.rename_headers, xpaths)
for item in xpaths:
if args.clean_header_field_names:
verboseprint(
Expand Down
4 changes: 4 additions & 0 deletions test_files/expected/extra_file_info_success_4.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
pds:logical_identifier<1>,filename,pds:version_id<1>,filepath,pds:title<1>
urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,tester_label_1.xml,1.0,labels/tester_label_1.xml,Cassini ISS Image 1455200455n.img
urn:nasa:pds:uranus_occ_u149_irtf_320cm:data:2200nm_counts-v-time_occult,tester_label_2.xml,1.0,labels/tester_label_2.xml,Calibrated Time Series of the Uranus System Occultation of Star u149 (2MASS 20462044-1838345) Observed from the IRTF 320cm Telescope
urn:nasa:pds:cassini_iss_cruise:data_raw:1357539630n,tester_label_3.xml,1.0,labels/tester_label_3.xml,Cassini ISS Image 1357539630n.img
5 changes: 5 additions & 0 deletions test_files/expected/extra_file_info_success_4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pds:logical_identifier<1>
filename
pds:version_id<1>
filepath
pds:title<1>
2 changes: 2 additions & 0 deletions test_files/expected/rename_headers_success_1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
spice_1,spice_2,spice_3,spice_4,spice_5
ura111.bsp,vgr2.ura111.bsp,earthstns_itrf93_040916.bsp,earth_720101_031229.bpc,naif0012.tls
5 changes: 5 additions & 0 deletions test_files/expected/rename_headers_success_1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
spice_1
spice_2
spice_3
spice_4
spice_5
2 changes: 2 additions & 0 deletions test_files/expected/rename_headers_success_2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
spice_1,pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>,spice_3,spice_4,spice_5
ura111.bsp,vgr2.ura111.bsp,earthstns_itrf93_040916.bsp,earth_720101_031229.bpc,naif0012.tls
5 changes: 5 additions & 0 deletions test_files/expected/rename_headers_success_2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
spice_1
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>
spice_3
spice_4
spice_5
68 changes: 68 additions & 0 deletions test_files/labels/bad_quoted_label.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1B00.sch"
schematypens="http://purl.oclc.org/dsdl/schematron"?>

<Product_Collection xmlns="http://pds.nasa.gov/pds4/pds/v1"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pds.nasa.gov/pds4/pds/v1 https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1B00.xsd">
<Identification_Area>
<logical_identifier>urn:nasa:pds:bundle_1:document</logical_identifier>
<version_id>1.1</version_id>
<title>Label 1</title>
<information_model_version>1.11.0.0</information_model_version>
<product_class>Product_Collection</product_class>
<Citation_Information>
<editor_list>"M. W. Evans"</editor_list>
<publication_year>2023</publication_year>
<description>
This is the first of three identical labels.
</description>
</Citation_Information>
<Modification_History>
<Modification_Detail>
<modification_date>2024-05-21</modification_date>
<version_id>1.1</version_id>
<description>Updated LIDVID of Users Guide to reflect updated version</description>
</Modification_Detail>
<Modification_Detail>
<modification_date>2020-03-31</modification_date>
<version_id>1.0</version_id>
<description>Initial version</description>
</Modification_Detail>
</Modification_History>
</Identification_Area>
<Collection>
<collection_type>Document</collection_type>
</Collection>
<File_Area_Inventory>
<File>
<file_name>some_collection_1.csv</file_name>
<creation_date_time>2023-07-20T13:46:52</creation_date_time>
<md5_checksum>3d330c619690d633c8f91a28cffd9756</md5_checksum>
</File>
<Inventory>
<offset unit="byte">0</offset>
<parsing_standard_id>PDS DSV 1</parsing_standard_id>
<records>3</records>
<record_delimiter>Carriage-Return Line-Feed</record_delimiter>
<field_delimiter>Comma</field_delimiter>
<Record_Delimited>
<fields>2</fields>
<groups>0</groups>
<Field_Delimited>
<name>Member Status</name>
<field_number>1</field_number>
<data_type>ASCII_String</data_type>
<maximum_field_length unit="byte">1</maximum_field_length>
</Field_Delimited>
<Field_Delimited>
<name>LIDVID_LID</name>
<field_number>2</field_number>
<data_type>ASCII_LIDVID_LID</data_type>
<maximum_field_length unit="byte">255</maximum_field_length>
</Field_Delimited>
</Record_Delimited>
<reference_type>inventory_has_member_product</reference_type>
</Inventory>
</File_Area_Inventory>
</Product_Collection>
5 changes: 5 additions & 0 deletions test_files/samples/element_with_filename.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>
filename
pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1>
filepath
pds:Product_Observational/pds:Identification_Area<1>/pds:title<1>
5 changes: 5 additions & 0 deletions test_files/samples/rename_headers_file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>, spice_1
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>, spice_2
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<3>/geom:spice_kernel_file_name<1>, spice_3
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<4>/geom:spice_kernel_file_name<1>, spice_4
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<5>/geom:spice_kernel_file_name<1>, spice_5
5 changes: 5 additions & 0 deletions test_files/samples/rename_headers_file_bad.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>, spice_1, spice_1_a
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>, spice_2, spice_2_a
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<3>/geom:spice_kernel_file_name<1>, spice_3, spice_3_a
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<4>/geom:spice_kernel_file_name<1>, spice_4, spice_4_a
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<5>/geom:spice_kernel_file_name<1>, spice_5, spice_5_a
8 changes: 8 additions & 0 deletions test_files/samples/rename_headers_file_blanks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>, spice_1

# pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>, spice_2


pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<3>/geom:spice_kernel_file_name<1>, spice_3
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<4>/geom:spice_kernel_file_name<1>, spice_4
pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<5>/geom:spice_kernel_file_name<1>, spice_5
Loading