From 75419e6674d34d5928eb0f7edde6bba24ce846ab Mon Sep 17 00:00:00 2001 From: Paul Bartell Date: Wed, 2 Jun 2021 11:16:57 -0700 Subject: [PATCH 1/2] Open link-verifier target files with encoding="utf8", errors='ignore' options Print each file name that is processed to stdout. --- link-verifier/verify-links.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/link-verifier/verify-links.py b/link-verifier/verify-links.py index e9d51d18..7c791827 100755 --- a/link-verifier/verify-links.py +++ b/link-verifier/verify-links.py @@ -335,7 +335,9 @@ def main(): dirs[:] = [dir for dir in dirs if dir.lower() not in exclude_dirs] for file in files: if any(file.endswith(file_type) for file_type in args.include_files): - with open(os.path.join(root, file), 'r') as f: + f_path = os.path.join(root, file) + print("Processing File: {}".format(f_path)) + with open(f_path, 'r', encoding="utf8", errors='ignore') as f: text = f.read() urls = re.findall(URL_SEARCH_TERM, text) for url in urls: From e95e2782ff81e53d53f985c046721089f7138fa2 Mon Sep 17 00:00:00 2001 From: Paul Bartell Date: Wed, 2 Jun 2021 12:16:19 -0700 Subject: [PATCH 2/2] Add comment regarding errors="ignore" argument --- link-verifier/verify-links.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/link-verifier/verify-links.py b/link-verifier/verify-links.py index 7c791827..d3e38427 100755 --- a/link-verifier/verify-links.py +++ b/link-verifier/verify-links.py @@ -338,6 +338,8 @@ def main(): f_path = os.path.join(root, file) print("Processing File: {}".format(f_path)) with open(f_path, 'r', encoding="utf8", errors='ignore') as f: + # errors='ignore' argument Suppresses UnicodeDecodeError + # when reading invalid UTF-8 characters. text = f.read() urls = re.findall(URL_SEARCH_TERM, text) for url in urls: