Skip to content

Commit

Permalink
Switch from CSV to YAML
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma committed Dec 3, 2023
1 parent 1f5ed08 commit 003f2c6
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 25 deletions.
2 changes: 2 additions & 0 deletions requirements/ci-3.11.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ pytest-timeout==2.1.0
# via -r requirements/ci.in
pytest-xdist==3.3.1
# via -r requirements/ci.in
pyyaml==6.0.1
# via -r requirements/ci.in
ruff==0.0.290
# via -r requirements/ci.in
typeguard==4.1.2
Expand Down
1 change: 1 addition & 0 deletions requirements/ci.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ pytest-cov
typeguard
types-dataclasses
types-Pillow
pyyaml
2 changes: 2 additions & 0 deletions requirements/ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ pytest-timeout==2.1.0
# via -r requirements/ci.in
pytest-xdist==3.0.2
# via -r requirements/ci.in
pyyaml==6.0.1
# via -r requirements/ci.in
six==1.16.0
# via flake8-print
tomli==1.2.3
Expand Down
20 changes: 8 additions & 12 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import concurrent.futures
import csv
import ssl
import urllib.request
from pathlib import Path
from typing import Dict, List, Optional
from urllib.error import HTTPError

import yaml

from pypdf.generic import DictionaryObject, IndirectObject


Expand Down Expand Up @@ -113,15 +114,10 @@ def is_sublist(child_list, parent_list):
return is_sublist(child_list, parent_list[1:])


def read_csv_to_list_of_dicts(file_path: Path) -> List[Dict[str, str]]:
    """
    Read a CSV file with a header row into a list of row dictionaries.

    Args:
        file_path: Location of the CSV file; it is decoded as UTF-8.

    Returns:
        One dictionary per data row, keyed by the column names taken from
        the first row. A file that contains only the header row yields an
        empty list.
    """
    # newline="" leaves newline handling to the csv module, as the csv
    # documentation recommends for file objects passed to a reader.
    with open(file_path, newline="", encoding="utf-8") as file:
        return list(csv.DictReader(file))
def read_yaml_to_list_of_dicts(yaml_file: Path) -> List[Dict[str, str]]:
    """
    Load a YAML document and return its parsed content.

    Args:
        yaml_file: Location of the YAML file; it is decoded as UTF-8.

    Returns:
        The value produced by ``yaml.safe_load``. The annotation promises a
        list of string-to-string dictionaries, but the structure is not
        validated here. An empty document yields an empty list.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(yaml_file, encoding="utf-8") as yaml_input:
        data = yaml.safe_load(yaml_input)
    # safe_load returns None for an empty document; hand callers an empty
    # list instead so that iterating or calling len() on the result works.
    return [] if data is None else data


def download_test_pdfs():
Expand All @@ -130,7 +126,7 @@ def download_test_pdfs():
This is especially important to avoid pytest timeouts.
"""
pdfs = read_csv_to_list_of_dicts(Path(__file__).parent / "example_files.csv")
pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml")

with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
futures = [
Expand All @@ -141,7 +137,7 @@ def download_test_pdfs():


def test_csv_consistency():
pdfs = read_csv_to_list_of_dicts(Path(__file__).parent / "example_files.csv")
pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.csv")
# Ensure the names are unique
assert len(pdfs) == len({pdf["name"] for pdf in pdfs})

Expand Down
13 changes: 0 additions & 13 deletions tests/example_files.csv

This file was deleted.

24 changes: 24 additions & 0 deletions tests/example_files.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
- local_filename: 2201.00214.pdf
url: https://arxiv.org/pdf/2201.00214.pdf
- local_filename: ASurveyofImageClassificationBasedTechniques.pdf
url: https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf
- local_filename: Giacalone.pdf
url: https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf
- local_filename: iss1718.pdf
url: https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf
- local_filename: iss2077.pdf
url: https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf
- local_filename: pdf_font_garbled.pdf
url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf
- local_filename: The%20lean%20times%20in%20the%20Peruvian%20economy.pdf
url: https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf
- local_filename: tika-908104.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf
- local_filename: tika-923406.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf
- local_filename: tika-955562.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf
- local_filename: tika-959173.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf
- local_filename: waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf
url: https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf

0 comments on commit 003f2c6

Please sign in to comment.