diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index 33143f5cd..f382fe2b9 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -69,6 +69,8 @@ pytest-timeout==2.1.0 # via -r requirements/ci.in pytest-xdist==3.3.1 # via -r requirements/ci.in +pyyaml==6.0.1 + # via -r requirements/ci.in ruff==0.0.290 # via -r requirements/ci.in typeguard==4.1.2 diff --git a/requirements/ci.in b/requirements/ci.in index ff071d125..4c14acc41 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -17,3 +17,4 @@ pytest-cov typeguard types-dataclasses types-Pillow +pyyaml diff --git a/requirements/ci.txt b/requirements/ci.txt index 0d2814426..ebb121aa6 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -83,6 +83,8 @@ pytest-timeout==2.1.0 # via -r requirements/ci.in pytest-xdist==3.0.2 # via -r requirements/ci.in +pyyaml==6.0.1 + # via -r requirements/ci.in six==1.16.0 # via flake8-print tomli==1.2.3 diff --git a/tests/__init__.py b/tests/__init__.py index 9f5e28b74..d81f2c94c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,11 +1,12 @@ import concurrent.futures -import csv import ssl import urllib.request from pathlib import Path from typing import Dict, List, Optional from urllib.error import HTTPError +import yaml + from pypdf.generic import DictionaryObject, IndirectObject @@ -113,15 +114,10 @@ def is_sublist(child_list, parent_list): return is_sublist(child_list, parent_list[1:]) -def read_csv_to_list_of_dicts(file_path: Path) -> List[Dict[str, str]]: - data_list = [] - - with open(file_path, newline="", encoding="utf-8") as file: - reader = csv.DictReader(file) - - data_list = list(reader) - - return data_list +def read_yaml_to_list_of_dicts(yaml_file: Path) -> List[Dict[str, str]]: + with open(yaml_file) as yaml_input: + data = yaml.safe_load(yaml_input) + return data def download_test_pdfs(): @@ -130,7 +126,7 @@ def download_test_pdfs(): This is especially important to avoid pytest timeouts. 
""" - pdfs = read_csv_to_list_of_dicts(Path(__file__).parent / "example_files.csv") + pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml") with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: futures = [ @@ -141,7 +137,7 @@ def download_test_pdfs(): def test_csv_consistency(): - pdfs = read_csv_to_list_of_dicts(Path(__file__).parent / "example_files.csv") + pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml") # Ensure the names are unique assert len(pdfs) == len({pdf["name"] for pdf in pdfs}) diff --git a/tests/example_files.csv b/tests/example_files.csv deleted file mode 100644 index f42e9fd43..000000000 --- a/tests/example_files.csv +++ /dev/null @@ -1,13 +0,0 @@ -local_filename,url -2201.00214.pdf,https://arxiv.org/pdf/2201.00214.pdf -ASurveyofImageClassificationBasedTechniques.pdf,https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf -Giacalone.pdf,https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf -iss1718.pdf,https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf -iss2077.pdf,https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf -pdf_font_garbled.pdf,https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf -The%20lean%20times%20in%20the%20Peruvian%20economy.pdf,https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf -tika-908104.pdf,https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf -tika-923406.pdf,https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf -tika-955562.pdf,https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf -tika-959173.pdf,https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf
-waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf,https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf diff --git a/tests/example_files.yaml b/tests/example_files.yaml new file mode 100644 index 000000000..7ccc46ada --- /dev/null +++ b/tests/example_files.yaml @@ -0,0 +1,24 @@ +- local_filename: 2201.00214.pdf + url: https://arxiv.org/pdf/2201.00214.pdf +- local_filename: ASurveyofImageClassificationBasedTechniques.pdf + url: https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf +- local_filename: Giacalone.pdf + url: https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf +- local_filename: iss1718.pdf + url: https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf +- local_filename: iss2077.pdf + url: https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf +- local_filename: pdf_font_garbled.pdf + url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf +- local_filename: The%20lean%20times%20in%20the%20Peruvian%20economy.pdf + url: https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf +- local_filename: tika-908104.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf +- local_filename: tika-923406.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf +- local_filename: tika-955562.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf +- local_filename: tika-959173.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf +- local_filename: waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf + url: https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf