Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PDS3 maintenance update; new attributes in Pds3File #58

Merged
merged 1 commit into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions holdings_maintenance/pds3/crlf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
##########################################################################################
# crlf.py
#
# Program to validate and/or repair the CRLF line terminators in a file.
#
# Use:
# python crlf.py --repair file(s) # Repair any files that have invalid terminators
# python crlf.py file(s) # Identify any files with invalid terminators
#
# Files that are invalid are listed. Add the "--verbose" option to list all files checked,
# even if they are OK.
##########################################################################################

import sys

# Create a dictionary identifying non-ASCII characters with an "x"
NON_ASCIIS = {}
for c in range(32):
NON_ASCIIS[c] = 'x'
for c in range(32, 128):
NON_ASCIIS[c] = None
for c in range(128, 256):
NON_ASCIIS[c] = 'x'
NON_ASCIIS[ord('\r')] = None
NON_ASCIIS[ord('\n')] = None
NON_ASCIIS[ord('\t')] = None


def test_crlf(filepath, task='test', threshold=0.01):
"""Test the presence of CRLF line terminators in the given file and optionally rewrite
it.

Parameters:
filepath (str or pathlib.Path): path to the file.
task (str): "test" to test the file; "repair" to rewrite it if necessary.
threshold (float): Fraction of non-ASCII characters indicating that this is a
binary file. If the the fraction of non-ASCII characters exceeds this value,
the file is not modified and "binary" is returned

Returns:
str: "BINARY" if the file is binary; "REPAIRED" if the file was rewritten;
"INVALID" if the file has invalid line terminators; "OK" otherwise.
"""

if task not in {'test', 'repair'}:
raise ValueError('invalid task')

if not 0. <= threshold <= 1.:
raise ValueError('invalid threshold')

# Read the file as a byte string
with open(filepath, 'rb') as f:
content = f.read()

# Count the non-ASCII characters
content = content.decode('latin8')
non_asciis = len(content.translate(NON_ASCIIS))

# If the non-ASCII fraction is above the threshold, it's a binary file
if non_asciis/len(content) > threshold:
return 'BINARY'

# Split the file content into records
recs = content.split('\n')

# For each record not ending in CR, append the CR
repaired = False
for k, rec in enumerate(recs[:-1]):
if len(rec) == 0 or rec[-1] != '\r':
recs[k] = rec + '\r'
repaired = True

# Append CRLF at the end if it's missing
if recs[-1]:
recs[-1] += '\r\n'
repaired = True

# If the content has changed, rewrite the file
if repaired:
if task == 'repair':
content = '\n'.join(recs).encode('latin8')
with open(filepath, 'wb') as f:
f.write(content)
return 'REPAIRED'
return 'INVALID'

return 'OK'


if __name__ == '__main__':

    # Interpret command-line options; the remaining arguments are file paths
    task = 'test'
    if '--repair' in sys.argv:
        sys.argv.remove('--repair')
        task = 'repair'

    verbose = False
    if '--verbose' in sys.argv:
        sys.argv.remove('--verbose')
        verbose = True

    # Check each file, printing its status unless it is OK and not --verbose
    repairs = 0
    invalid = 0
    for path in sys.argv[1:]:
        status = test_crlf(path, task=task)
        if verbose or status in {'REPAIRED', 'INVALID'}:
            print(path, status)
        if status == 'REPAIRED':
            repairs += 1
        if status == 'INVALID':
            invalid += 1

    # Print a summary when multiple files were checked. The repaired and invalid
    # counts are mutually exclusive: "repair" mode yields REPAIRED, "test" mode
    # yields INVALID. (The previous extra "if repairs == 1" test suppressed the
    # repaired-count line whenever more than one file was repaired.)
    nfiles = len(sys.argv[1:])
    if nfiles > 1:
        if repairs:
            print(f'{repairs}/{nfiles} files repaired')
        elif invalid:
            print(f'{invalid}/{nfiles} files invalid')
        else:
            print(str(nfiles), 'files tested')
40 changes: 25 additions & 15 deletions holdings_maintenance/pds3/pdsdependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,15 @@
('.*/JNCJNC_0xxx/.*', 0, ['metadata', 'cumindex999']),
('.*/HST.x_xxxx/.*', 0, ['hst', 'metadata', 'cumindex9_9999']),
('.*/NH..(LO|MV)_xxxx/.*', 0, ['metadata', 'supplemental', 'cumindexNH']),
('.*/NH(JU|LA).._..../.*', 0, ['nhbrowse_vx', 'jupiter', 'rings', 'moons',
'inventory']),
('.*/NH(PC|PE).._..../.*', 0, ['nhbrowse', 'pluto', 'rings', 'moons',
'inventory']),
('.*/NH(KC|KE|K2).._..../.*', 0, ['nhbrowse']),
('.*/NH(JU|LA)LO_[12]00.*', 0, ['jupiter', 'rings', 'moons', 'inventory']),
('.*/NHP.LO_[12]00.*', 0, ['pluto', 'rings', 'moons', 'inventory']),
('.*/NH[LPK].LO_[12]00.*', 0, ['nhbrowse']),
('.*(?<!_v[12])/NHJULO_100.*', 0, ['nhbrowse']), # not NHJULO_1001 _v1-2
('.*(?<!_v[123])/NHJULO_200.*', 0, ['nhbrowse']), # not NHJULO_2001 _v1-3
('.*/NH[PK].MV_[12]00.*', 0, ['nhbrowse']),
('.*(?<!_v1)/NHLAMV_[12]00.*', 0, ['nhbrowse_vx']), # not LA _v1
('.*/NHJUMV_100.*', 0, ['nhbrowse_vx']),
('.*(?<!_v1)/NHJUMV_200.*', 0, ['nhbrowse_vx']), # not NHJUMV_2001 _v1
('.*/RPX_xxxx/.*', 0, ['metadata']),
('.*/RPX_xxxx/RPX_000.*', 0, ['obsindex', 'cumindex99']),
('.*/VGISS_[5678]xxx/.*', 0, ['vgiss', 'metadata', 'raw_image',
Expand Down Expand Up @@ -213,12 +217,12 @@ def test1(self, dirpath, check_newer=True, limit=200, logger=None):
confirmed = set()
try:
pattern = pdsdir.root_ + self.glob_pattern
pattern = pattern.replace('$', pdsdir.bundleset_[:-1], 1)
pattern = pattern.replace('$', pdsdir.volset_[:-1], 1)
if '$' in pattern:
if self.func is None:
volname = pdsdir.bundlename
volname = pdsdir.volname
else:
volname = self.func(pdsdir.bundlename, *self.args)
volname = self.func(pdsdir.volname, *self.args)
pattern = pattern.replace('$', volname, 1)

abspaths = glob.glob(pattern)
Expand All @@ -245,7 +249,6 @@ def test1(self, dirpath, check_newer=True, limit=200, logger=None):

(requirement, count) = self.regex.subn(sub, path)
absreq = (pdsdir.root_ + requirement)

if count == 0:
logger.error('Invalid test', absreq)
continue
Expand Down Expand Up @@ -886,28 +889,28 @@ def cumname(volname, nines):
r'<PREVIEW> [d]volumes/\1/\3.LBL -> [d]previews/\1/\3_*.jpg',
suite='hst', newer=False)

# For NHxxLO_xxxx and NHxxMV_xxxx browse, stripping version number
# For NHxxLO_xxxx and NHxxMV_xxxx browse, stripping version number if present
_ = PdsDependency(
'Previews of every NH image file',
'volumes/$/$/data/*/*.fit',
r'volumes/(NHxx.._....)(|_v[\.\d]+)/(NH.*?)(|_[0-9]+).fit',
r'volumes/(NHxx.._....)(|_v[\.\d]+)/(NH\w+/data/\w+/\w{24})(|_[0-9]+)\.fit',
[r'previews/\1/\3_thumb.jpg',
r'previews/\1/\3_small.jpg',
r'previews/\1/\3_med.jpg',
r'previews/\1/\3_full.jpg'],
r'<PREVIEW> [d]volumes/\1/\3\4.fit -> [d]previews/\1/\3_*.jpg',
r'<PREVIEW> [d]volumes/\1\2/\3\4.fit -> [d]previews/\1/\3_*.jpg',
suite='nhbrowse', newer=False)

# For NHxxLO_xxxx and NHxxMV_xxxx browse, without stripping version number
# For NHxxLO_xxxx and NHxxMV_xxxx browse, retaining version number
_ = PdsDependency(
'Previews of every NH image file',
'volumes/$/$/data/*/*.fit',
r'volumes/(NHxx.._....)(|_v[\.\d]+)/(NH.*?).fit',
r'volumes/(NHxx.._....)(|_v[\.\d]+)/(NH.*?)\.fit',
[r'previews/\1/\3_thumb.jpg',
r'previews/\1/\3_small.jpg',
r'previews/\1/\3_med.jpg',
r'previews/\1/\3_full.jpg'],
r'<PREVIEW> [d]volumes/\1/\3.fit -> [d]previews/\1/\3_*.jpg',
r'<PREVIEW> [d]volumes/\1\2/\3.fit -> [d]previews/\1/\3_*.jpg',
suite='nhbrowse_vx', newer=False)

# For VGISS_[5678]xxx
Expand Down Expand Up @@ -1061,6 +1064,13 @@ def main():
else:
paths.append(os.path.abspath(path))

# Check for valid volume IDs
for path in paths:
basename = os.path.basename(path)
if not pdsfile.Pds3File.VOLNAME_REGEX_I.match(basename):
print('Invalid volume ID: ' + path)
sys.exit(1)

# Loop through paths...
logger.open(' '.join(sys.argv))
try:
Expand Down
38 changes: 19 additions & 19 deletions holdings_maintenance/pds3/re-validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ def validate_one_volume(pdsdir, voltypes, tests, args, logger):
tests_performed = 0

# Open logger for this volume
logfiles = set([pdsdir.log_path_for_bundle('_re-validate',
logfiles = set([pdsdir.log_path_for_volume('_re-validate',
dir='re-validate'),
pdsdir.log_path_for_bundle('_re-validate',
pdsdir.log_path_for_volume('_re-validate',
dir='re-validate',
place='parallel')])

Expand Down Expand Up @@ -83,7 +83,7 @@ def validate_one_volume(pdsdir, voltypes, tests, args, logger):
if not os.path.exists(abspath):
continue

temp_pdsdir = pdsfile.PdsFile.from_abspath(abspath)
temp_pdsdir = pdsfile.Pds3File.from_abspath(abspath)
if args.checksums:
logger.open('Checksum re-validatation for', abspath)
try:
Expand Down Expand Up @@ -113,7 +113,7 @@ def validate_one_volume(pdsdir, voltypes, tests, args, logger):
abspath = abspath[0] # there should only be one

(prefix, basename) = os.path.split(abspath)
temp_pdsdir = pdsfile.PdsFile.from_abspath(prefix)
temp_pdsdir = pdsfile.Pds3File.from_abspath(prefix)
logger.open('Checksum re-validatation for', abspath)
try:
pdschecksums.validate(temp_pdsdir, basename, logger)
Expand All @@ -128,7 +128,7 @@ def validate_one_volume(pdsdir, voltypes, tests, args, logger):
if not os.path.exists(abspath):
continue

temp_pdsdir = pdsfile.PdsFile.from_abspath(abspath)
temp_pdsdir = pdsfile.Pds3File.from_abspath(abspath)
if args.infoshelves:
logger.open('Infoshelf re-validatation for', abspath)
try:
Expand Down Expand Up @@ -159,7 +159,7 @@ def validate_one_volume(pdsdir, voltypes, tests, args, logger):
abspath = abspath[0] # there should only be one

(prefix, basename) = os.path.split(abspath)
temp_pdsdir = pdsfile.PdsFile.from_abspath(prefix)
temp_pdsdir = pdsfile.Pds3File.from_abspath(prefix)
logger.open('Infoshelf re-validatation for', abspath)
try:
pdsinfoshelf.validate(temp_pdsdir, basename, logger)
Expand Down Expand Up @@ -331,7 +331,7 @@ def get_volume_info(holdings):

info_list = []
for abspath in abspaths:
pdsdir = pdsfile.PdsFile.from_abspath(abspath)
pdsdir = pdsfile.Pds3File.from_abspath(abspath)
info_list.append((abspath, pdsdir.date))

return info_list
Expand Down Expand Up @@ -586,7 +586,7 @@ def send_email(to_addr, subject, message):
logger = pdslogger.PdsLogger(LOGNAME, limits=new_limits)

# Place to search for existing logs in batch mode
pdsfile.PdsFile.set_log_root(args.log)
pdsfile.Pds3File.set_log_root(args.log)

if not args.quiet:
logger.add_handler(pdslogger.stdout_handler)
Expand Down Expand Up @@ -617,14 +617,14 @@ def send_email(to_addr, subject, message):
roots = set()
for volume in args.volume:
abspath = os.path.abspath(volume)
pdsdir = pdsfile.PdsFile.from_abspath(abspath)
pdsdir = pdsfile.Pds3File.from_abspath(abspath)
if pdsdir.category_ != 'volumes/' or pdsdir.interior:
print('Not a volume path: ', pdsdir.abspath)
sys.exit(1)

logger.add_root(pdsdir.root_)

if pdsdir.bundlename:
if pdsdir.volname:
pdsdirs.append(pdsdir)
else:
for name in pdsdir.childnames:
Expand Down Expand Up @@ -677,7 +677,7 @@ def send_email(to_addr, subject, message):
holdings_abspaths = set(holdings_abspaths)

# Read the existing logs
(log_info, logs_for_volset_bundlename) = get_all_log_info(args.log)
(log_info, logs_for_volset_volname) = get_all_log_info(args.log)

# Read the current holdings
holdings_info = []
Expand All @@ -691,10 +691,10 @@ def send_email(to_addr, subject, message):

# Report missing volumes
for key in missing_keys:
# Determine if this volset/bundlename has ever appeared in any of the
# Determine if this volset has ever appeared in any of the
# holdings directory trees
holdings_for_key = set()
for log_path in logs_for_volset_bundlename[key]:
for log_path in logs_for_volset_volname[key]:
volume_abspath = volume_abspath_from_log(log_path)
if volume_abspath == '': # if log file is empty
continue
Expand All @@ -718,18 +718,18 @@ def send_email(to_addr, subject, message):
fmt = '%4d %20s%-11s modified %s, not previously validated'
line_number = 0
for (abspath, date) in modified_holdings:
pdsdir = pdsfile.PdsFile.from_abspath(abspath)
pdsdir = pdsfile.Pds3File.from_abspath(abspath)
line_number += 1
print(fmt % (line_number, pdsdir.bundleset_, pdsdir.bundlename,
print(fmt % (line_number, pdsdir.volset_, pdsdir.volname,
date[:10]))

fmt ='%4d %20s%-11s modified %s, last validated %s, duration %s%s'
for info in current_logs:
(start, elapsed, date, abspath, had_error, had_fatal) = info
pdsdir = pdsfile.PdsFile.from_abspath(abspath)
pdsdir = pdsfile.Pds3File.from_abspath(abspath)
error_text = ', error logged' if had_error else ''
line_number += 1
print(fmt % (line_number, pdsdir.bundleset_, pdsdir.bundlename,
print(fmt % (line_number, pdsdir.volset_, pdsdir.volname,
date[:10], start[:10], elapsed[:-7], error_text))

sys.exit()
Expand All @@ -753,13 +753,13 @@ def send_email(to_addr, subject, message):

# For each volume...
for (abspath, mod_date, prev_validation, had_errors) in info:
pdsdir = pdsfile.PdsFile.from_abspath(abspath)
pdsdir = pdsfile.Pds3File.from_abspath(abspath)
if prev_validation is None:
ps = 'not previously validated'
else:
ps = 'last validated %s' % prev_validation[:10]
batch_message = '%20s%-11s modified %s, %s' % \
(pdsdir.bundleset_, pdsdir.bundlename, mod_date[:10], ps)
(pdsdir.volset_, pdsdir.volname, mod_date[:10], ps)
print(batch_message)

(log_path,
Expand Down
Loading
Loading