From 53ff74cb7f1223872ecece96e3010d4bed9bba13 Mon Sep 17 00:00:00 2001 From: Paul Bartell Date: Wed, 2 Jun 2021 13:24:40 -0700 Subject: [PATCH] Open link-verifier target files with encoding="utf8", errors='ignore' options (#29) * Open link-verifier target files with encoding="utf8", errors='ignore' options * Print each file path that is processed to stdout for debugging purposes. Co-authored-by: Archit Aggarwal --- link-verifier/verify-links.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/link-verifier/verify-links.py b/link-verifier/verify-links.py index e9d51d18..d3e38427 100755 --- a/link-verifier/verify-links.py +++ b/link-verifier/verify-links.py @@ -335,7 +335,11 @@ def main(): dirs[:] = [dir for dir in dirs if dir.lower() not in exclude_dirs] for file in files: if any(file.endswith(file_type) for file_type in args.include_files): - with open(os.path.join(root, file), 'r') as f: + f_path = os.path.join(root, file) + print("Processing File: {}".format(f_path)) + with open(f_path, 'r', encoding="utf8", errors='ignore') as f: + # The errors='ignore' argument suppresses UnicodeDecodeError + # when reading invalid UTF-8 characters. text = f.read() urls = re.findall(URL_SEARCH_TERM, text) for url in urls: