SETI · esimpsons3ti · Feb 18, 2025 · Feb 6, 2025 · Feb 10, 2025 · Feb 10, 2025
diff --git a/docs/pds4_create_xml_index.rst b/docs/pds4_create_xml_index.rst
@@ -195,10 +195,13 @@ Miscellaneous
 - ``--verbose``: Display detailed information during the file scraping process that may
   be useful for debugging.
 
+- ``--rename-headers``: Rename the output file's headers (XPaths or simplified XPaths)
+  to user-defined values read from a text file. Each line of that file must have the
+  format ``<old_column_name>,<new_column_name>``; blank and ``#`` lines are skipped.
+
 - ``--config-file``: Specify one or more YAML-style configuration files for further
   customization of the extraction process. See the section below for details.
 
-
 Configuration Files
 -------------------
 

diff --git a/pds4indextools/pds4_create_xml_index.py b/pds4indextools/pds4_create_xml_index.py
@@ -566,6 +566,49 @@ def split_xpath_prefix_and_num(s):
     return xpath_map
 
 
+def replace_columns(filepath, df_or_xpaths):
+    """
+    Rename column headers in a DataFrame or a list of XPaths using a mapping file.
+
+    Each line of the mapping file that is not blank and does not start with '#' must
+    split on ',' into exactly two fields, ``<old_name>,<new_name>``; whitespace around
+    both is stripped. Any other line prints an error and calls sys.exit(1). If an old
+    name appears on several lines, the last one wins. Names missing from the mapping
+    are left unchanged. A pandas.DataFrame input is returned via
+    DataFrame.rename(columns=...); any other input is iterated to build a new list.
+
+    Parameters:
+        filepath (str): Path to the text file containing old and new column names.
+        df_or_xpaths (pandas.DataFrame or list): The DataFrame or list holding the
+            original column names of the index/headers file.
+
+    Returns:
+        pandas.DataFrame or list: Renamed DataFrame, or list of renamed headers.
+    """
+    def load_mapping(file_path):
+        mapping = {}
+        with open(file_path, 'r') as file:  # NOTE(review): no encoding given; default is used
+            for line in file:
+                if not line.strip() or line.strip().startswith('#'):
+                    continue  # skip blank lines and '#' comment lines
+
+                parts = line.strip().split(',')
+                if len(parts) != 2:  # a comma inside either name also ends up here
+                    print(f"Invalid line in mapping file: {line.strip()}")
+                    sys.exit(1)
+
+                old_name, new_name = map(str.strip, parts)
+                mapping[old_name] = new_name  # a repeated old_name overwrites the earlier entry
+        return mapping
+
+    mapping = load_mapping(filepath)
+
+    if isinstance(df_or_xpaths, pd.DataFrame):
+        return df_or_xpaths.rename(columns=mapping)  # returns a new DataFrame
+
+    return [mapping.get(xpath, xpath) for xpath in df_or_xpaths]  # unmapped names pass through
+
+
 def split_into_elements(xpath):
     """
     Extract elements from an XPath in the order they appear.
@@ -714,7 +757,8 @@ def update_nillable_elements_from_xsd_file(xsd_file, nillable_elements_info):
                 nillable_elements_info[name] = 'External or built-in type'
 
 
-def write_results_to_csv(results_list, new_columns, args, output_csv_path):
+def write_results_to_csv(results_list, new_columns, elements_to_scrape, args,
+                         output_csv_path):
     """
     Write results from a list of dictionaries to a CSV file.
 
@@ -762,15 +806,15 @@ def pad_column_values_and_headers(df):
     df = pd.DataFrame(rows)
 
     if new_columns is not None:
-        new_columns_sorted = sorted(new_columns.items(), key=lambda x: x[1][0])
+        for col_name in elements_to_scrape:
+            if col_name in new_columns:
+                index, col_values = new_columns[col_name]
 
-        for col_name, (index, col_values) in new_columns_sorted:
-            # If the column already exists, remove it temporarily
-            if col_name in df.columns:
+                # Remove column if it already exists
                 df = df.drop(columns=[col_name])
 
-            # Insert the column at the desired index
-            df.insert(index, col_name, col_values)
+                # Insert in the correct position
+                df.insert(index, col_name, col_values)
 
     if (
         df.map(lambda x: isinstance(x, str) and ('"' in x))
@@ -799,6 +843,9 @@ def pad_column_values_and_headers(df):
             print(bad_sort)
             sys.exit(1)
 
+    if args.rename_headers:
+        df = replace_columns(args.rename_headers, df)
+
     if args.fixed_width:
         padded_df = pad_column_values_and_headers(df)
 
@@ -1301,6 +1348,11 @@ def main(cmd_line=None):
                            'file using additional --config-file arguments, in which case '
                            'each subsequent configuration file augments and overrides '
                            'the previous files.')
+    misc.add_argument('--rename-headers', type=str,
+                      metavar='RENAME_COLUMNS_FILEPATH',
+                      help='Optional text file mapping XPaths to new header names. Each '
+                           'line should contain an original XPath on the left and its '
+                           'replacement on the right. One entry per line.')
 
     args = parser.parse_args(cmd_line)
 
@@ -1369,9 +1421,10 @@ def main(cmd_line=None):
         and args.limit_xpaths_file
         and elements_to_scrape is not None
     ):
-        for x in elements_to_scrape:
-            if x in valid_add_extra_file_info:
-                extra_file_info_ind[x] = elements_to_scrape.index(x)
+        extra_file_info_ind = {
+            x: i for i, x in enumerate(elements_to_scrape)
+            if x in valid_add_extra_file_info
+        }
 
     # For each file in label_files, load in schema files and namespaces for reference.
     # Traverse the label file and scrape the desired contents. Place these contents
@@ -1467,11 +1520,12 @@ def main(cmd_line=None):
         all_results.append(label_results)
 
     for label_results in all_results:
-        if extra_file_info_ind != {}:
+        if extra_file_info_ind:
             new_columns = {}
-            for key in extra_file_info_ind.keys():
-                values = [d[key] for d in all_results]
-                new_columns[key] = (extra_file_info_ind[key], values)
+            for key in elements_to_scrape:
+                if key in extra_file_info_ind:
+                    values = [d[key] for d in all_results]
+                    new_columns[key] = (extra_file_info_ind[key], values)
         else:
             new_columns = None
 
@@ -1499,7 +1553,8 @@ def main(cmd_line=None):
                 original_headers[key] = key.split('/')[-1]
 
     if output_csv_path:
-        clean_header_mapping = write_results_to_csv(all_results, new_columns, args,
+        clean_header_mapping = write_results_to_csv(all_results, new_columns,
+                                                    elements_to_scrape, args,
                                                     output_csv_path)
 
     # To instead receive a list of available information available within a label or set
@@ -1517,22 +1572,34 @@ def main(cmd_line=None):
                     xpaths.append(xpath)
 
         if new_columns is not None:
-            # Sort new elements by index
-            new_elements_sorted = sorted(new_columns.items(), key=lambda x: x[1][0])
 
-            # Insert new elements into xpaths
-            for name, (index, value) in new_elements_sorted:
-                # Remove the value if it exists
-                if name in xpaths:
-                    xpaths.remove(name)
-                # Insert at the desired index
-                xpaths.insert(index, name)
+            # Create a new list to store the reordered elements
+            reordered_xpaths = [None] * (len(xpaths) + len(new_columns))
+
+            # Fill in known positions from new_columns
+            for col_name, (index, _) in new_columns.items():
+                reordered_xpaths[index] = col_name  # Place at correct index
+
+            # Fill in the rest of the elements while shifting to the left
+            xpath_idx = 0  # Index for iterating over
+            for i in range(len(reordered_xpaths)):
+                if reordered_xpaths[i] is None:  # If this slot isn't occupied
+                    while xpath_idx < len(xpaths) and xpaths[xpath_idx] in new_columns:
+                        xpath_idx += 1  # Skip over `filename` and `filepath`
+                    if xpath_idx < len(xpaths):
+                        reordered_xpaths[i] = xpaths[xpath_idx]  # Place original element
+                        xpath_idx += 1  # Move to next
+
+            # Remove any remaining `None` values (in case of overshoot)
+            xpaths = [x for x in reordered_xpaths if x is not None]
 
         # The file is now written and placed in a given location. If cleaned header
         # field names are requested, they are processed here before being written in.
         with open(output_txt_path, 'w') as output_fp:
             if args.simplify_xpaths:
                 xpaths = simplify_xpaths(xpaths)
+            if args.rename_headers:
+                xpaths = replace_columns(args.rename_headers, xpaths)
             for item in xpaths:
                 if args.clean_header_field_names:
                     verboseprint(

diff --git a/test_files/expected/extra_file_info_success_4.csv b/test_files/expected/extra_file_info_success_4.csv
@@ -0,0 +1,4 @@
+pds:logical_identifier<1>,filename,pds:version_id<1>,filepath,pds:title<1>
+urn:nasa:pds:cassini_iss_saturn:data_raw:1455200455n,tester_label_1.xml,1.0,labels/tester_label_1.xml,Cassini ISS Image 1455200455n.img
+urn:nasa:pds:uranus_occ_u149_irtf_320cm:data:2200nm_counts-v-time_occult,tester_label_2.xml,1.0,labels/tester_label_2.xml,Calibrated Time Series of the Uranus System Occultation of Star u149 (2MASS 20462044-1838345) Observed from the IRTF 320cm Telescope
+urn:nasa:pds:cassini_iss_cruise:data_raw:1357539630n,tester_label_3.xml,1.0,labels/tester_label_3.xml,Cassini ISS Image 1357539630n.img
diff --git a/test_files/expected/extra_file_info_success_4.txt b/test_files/expected/extra_file_info_success_4.txt
@@ -0,0 +1,5 @@
+pds:logical_identifier<1>
+filename
+pds:version_id<1>
+filepath
+pds:title<1>
diff --git a/test_files/expected/rename_headers_success_1.csv b/test_files/expected/rename_headers_success_1.csv
@@ -0,0 +1,2 @@
+spice_1,spice_2,spice_3,spice_4,spice_5
+ura111.bsp,vgr2.ura111.bsp,earthstns_itrf93_040916.bsp,earth_720101_031229.bpc,naif0012.tls
diff --git a/test_files/expected/rename_headers_success_1.txt b/test_files/expected/rename_headers_success_1.txt
@@ -0,0 +1,5 @@
+spice_1
+spice_2
+spice_3
+spice_4
+spice_5
diff --git a/test_files/expected/rename_headers_success_2.csv b/test_files/expected/rename_headers_success_2.csv
@@ -0,0 +1,2 @@
+spice_1,pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>,spice_3,spice_4,spice_5
+ura111.bsp,vgr2.ura111.bsp,earthstns_itrf93_040916.bsp,earth_720101_031229.bpc,naif0012.tls
diff --git a/test_files/expected/rename_headers_success_2.txt b/test_files/expected/rename_headers_success_2.txt
@@ -0,0 +1,5 @@
+spice_1
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>
+spice_3
+spice_4
+spice_5
diff --git a/test_files/labels/bad_quoted_label.xml b/test_files/labels/bad_quoted_label.xml
@@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1B00.sch"
+    schematypens="http://purl.oclc.org/dsdl/schematron"?>
+
+<Product_Collection xmlns="http://pds.nasa.gov/pds4/pds/v1"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://pds.nasa.gov/pds4/pds/v1 https://pds.nasa.gov/pds4/pds/v1/PDS4_PDS_1B00.xsd">
+    <Identification_Area>
+        <logical_identifier>urn:nasa:pds:bundle_1:document</logical_identifier>
+        <version_id>1.1</version_id>
+        <title>Label 1</title>
+        <information_model_version>1.11.0.0</information_model_version>
+        <product_class>Product_Collection</product_class>
+        <Citation_Information>
+            <editor_list>"M. W. Evans"</editor_list>
+            <publication_year>2023</publication_year>
+            <description>
+                This is the first of three identical labels. 
+            </description>
+        </Citation_Information>
+	<Modification_History>
+            <Modification_Detail>
+                <modification_date>2024-05-21</modification_date>
+                <version_id>1.1</version_id>
+                <description>Updated LIDVID of Users Guide to reflect updated version</description>
+            </Modification_Detail>
+            <Modification_Detail>
+                <modification_date>2020-03-31</modification_date>
+                <version_id>1.0</version_id>
+                <description>Initial version</description>
+            </Modification_Detail>
+        </Modification_History>
+    </Identification_Area>
+    <Collection>
+        <collection_type>Document</collection_type>
+    </Collection>
+    <File_Area_Inventory>
+        <File>
+            <file_name>some_collection_1.csv</file_name>
+            <creation_date_time>2023-07-20T13:46:52</creation_date_time> 
+            <md5_checksum>3d330c619690d633c8f91a28cffd9756</md5_checksum>
+        </File>
+        <Inventory>
+            <offset unit="byte">0</offset>
+            <parsing_standard_id>PDS DSV 1</parsing_standard_id>
+            <records>3</records>
+            <record_delimiter>Carriage-Return Line-Feed</record_delimiter>
+            <field_delimiter>Comma</field_delimiter>
+            <Record_Delimited>
+                <fields>2</fields>
+                <groups>0</groups>
+                <Field_Delimited>
+                    <name>Member Status</name>
+                    <field_number>1</field_number>
+                    <data_type>ASCII_String</data_type>
+                    <maximum_field_length unit="byte">1</maximum_field_length>
+                </Field_Delimited>
+                <Field_Delimited>
+                    <name>LIDVID_LID</name>
+                    <field_number>2</field_number>
+                    <data_type>ASCII_LIDVID_LID</data_type>
+                    <maximum_field_length unit="byte">255</maximum_field_length>
+                </Field_Delimited>
+            </Record_Delimited>
+            <reference_type>inventory_has_member_product</reference_type>
+        </Inventory>
+    </File_Area_Inventory>
+</Product_Collection>
diff --git a/test_files/samples/element_with_filename.txt b/test_files/samples/element_with_filename.txt
@@ -0,0 +1,5 @@
+pds:Product_Observational/pds:Identification_Area<1>/pds:logical_identifier<1>
+filename
+pds:Product_Observational/pds:Identification_Area<1>/pds:version_id<1>
+filepath
+pds:Product_Observational/pds:Identification_Area<1>/pds:title<1>
diff --git a/test_files/samples/rename_headers_file.txt b/test_files/samples/rename_headers_file.txt
@@ -0,0 +1,5 @@
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>, spice_1
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>, spice_2
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<3>/geom:spice_kernel_file_name<1>, spice_3
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<4>/geom:spice_kernel_file_name<1>, spice_4
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<5>/geom:spice_kernel_file_name<1>, spice_5
diff --git a/test_files/samples/rename_headers_file_bad.txt b/test_files/samples/rename_headers_file_bad.txt
@@ -0,0 +1,5 @@
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>, spice_1, spice_1_a
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>, spice_2, spice_2_a
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<3>/geom:spice_kernel_file_name<1>, spice_3, spice_3_a
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<4>/geom:spice_kernel_file_name<1>, spice_4, spice_4_a
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<5>/geom:spice_kernel_file_name<1>, spice_5, spice_5_a
diff --git a/test_files/samples/rename_headers_file_blanks.txt b/test_files/samples/rename_headers_file_blanks.txt
@@ -0,0 +1,8 @@
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<1>/geom:spice_kernel_file_name<1>, spice_1
+
+# pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>, spice_2
+
+
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<3>/geom:spice_kernel_file_name<1>, spice_3
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<4>/geom:spice_kernel_file_name<1>, spice_4
+pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<5>/geom:spice_kernel_file_name<1>, spice_5
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		spice_1,spice_2,spice_3,spice_4,spice_5
		ura111.bsp,vgr2.ura111.bsp,earthstns_itrf93_040916.bsp,earth_720101_031229.bpc,naif0012.tls
-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
+    spice_1
+    spice_2
+    spice_3
+    spice_4
+    spice_5
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		spice_1,pds:Product_Observational/pds:Observation_Area<1>/pds:Discipline_Area<1>/geom:Geometry<1>/geom:SPICE_Kernel_Files<1>/geom:SPICE_Kernel_Identification<2>/geom:spice_kernel_file_name<1>,spice_3,spice_4,spice_5
		ura111.bsp,vgr2.ura111.bsp,earthstns_itrf93_040916.bsp,earth_720101_031229.bpc,naif0012.tls