Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: Centralize file downloads #2324

Merged
merged 10 commits into from
Dec 9, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions requirements/ci-3.11.txt
Original file line number Diff line number Diff line change
@@ -69,6 +69,8 @@ pytest-timeout==2.1.0
# via -r requirements/ci.in
pytest-xdist==3.3.1
# via -r requirements/ci.in
pyyaml==6.0.1
# via -r requirements/ci.in
ruff==0.0.290
# via -r requirements/ci.in
typeguard==4.1.2
1 change: 1 addition & 0 deletions requirements/ci.in
Original file line number Diff line number Diff line change
@@ -17,3 +17,4 @@ pytest-cov
typeguard
types-dataclasses
types-Pillow
pyyaml
2 changes: 2 additions & 0 deletions requirements/ci.txt
Original file line number Diff line number Diff line change
@@ -83,6 +83,8 @@ pytest-timeout==2.1.0
# via -r requirements/ci.in
pytest-xdist==3.0.2
# via -r requirements/ci.in
pyyaml==6.0.1
# via -r requirements/ci.in
six==1.16.0
# via flake8-print
tomli==1.2.3
74 changes: 51 additions & 23 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import concurrent.futures
import ssl
import urllib.request
from pathlib import Path
from typing import List
from typing import Dict, List, Optional
from urllib.error import HTTPError

import yaml

from pypdf.generic import DictionaryObject, IndirectObject


def get_data_from_url(url: Optional[str] = None, name: Optional[str] = None) -> bytes:
    """
    Download a File from a URL and return its contents.

    Downloaded content is stored as ``pdf_cache/<name>`` next to this module
    and is served from there on later calls, so a given name is fetched at
    most once. ``file://`` URLs are read straight from disk and are never
    written to the cache.

    Args:
        url: Location of the file. May be ``None`` when the file is expected
            to be present in the cache already.
        name: File name used inside the cache directory. Required.

    Returns:
        Read File as bytes

    Raises:
        ValueError: If ``name`` is ``None``.
        HTTPError: If the download still fails after three attempts.
        FileNotFoundError: If ``url`` is ``None`` and ``name`` is not cached.
    """
    if name is None:
        raise ValueError("A name must always be specified")

    cache_dir = Path(__file__).parent / "pdf_cache"
    # exist_ok avoids a FileExistsError when several threads reach this
    # point at the same time and all find the directory missing.
    cache_dir.mkdir(exist_ok=True)
    cache_path = cache_dir / name

    if url is not None:
        if url.startswith("file://"):
            with open(url[7:].replace("\\", "/"), "rb") as fp:
                return fp.read()
        if not cache_path.exists():
            # NOTE: this replaces the ssl module's default HTTPS context
            # factory with the unverified one; the assignment is module-wide,
            # not limited to the request below.
            ssl._create_default_https_context = ssl._create_unverified_context
            max_attempts = 3
            for attempt in range(1, max_attempts + 1):
                try:
                    with urllib.request.urlopen(url) as response:  # noqa: S310
                        payload = response.read()
                except HTTPError:
                    # Retry on HTTP errors, but surface the last one instead
                    # of falling through to a misleading FileNotFoundError.
                    if attempt == max_attempts:
                        raise
                else:
                    # Only touch the cache once the whole body was read, so
                    # a failed transfer never leaves a truncated cache file.
                    cache_path.write_bytes(payload)
                    break
    return cache_path.read_bytes()
@@ -106,12 +114,32 @@ def is_sublist(child_list, parent_list):
return is_sublist(child_list, parent_list[1:])


def read_yaml_to_list_of_dicts(yaml_file: Path) -> List[Dict[str, str]]:
    """
    Load a YAML file and return its parsed content.

    Args:
        yaml_file: Path of the YAML document to read.

    Returns:
        The parsed document. An empty document yields an empty list, so the
        result can always be iterated.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(yaml_file, encoding="utf-8") as yaml_input:
        data = yaml.safe_load(yaml_input)
    # safe_load returns None for an empty document.
    return [] if data is None else data


def download_test_pdfs():
    """
    Run this before the tests are executed to ensure you have everything locally.

    This is especially important to avoid pytest timeouts.

    The files to fetch are read from ``example_files.yaml`` next to this
    module; each entry provides a ``url`` and a ``local_filename``. Downloads
    run in a pool of up to 20 threads.
    """
    pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml")

    # Submit one job per cache file name. A name listed more than once would
    # otherwise have several workers writing the same cache file at the same
    # time. The first URL listed for a name is the one that is used.
    url_by_name = {}
    for pdf in pdfs:
        url_by_name.setdefault(pdf["local_filename"], pdf["url"])

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [
            executor.submit(get_data_from_url, url, name=local_filename)
            for local_filename, url in url_by_name.items()
        ]
        # wait() only blocks until all jobs finished; an exception raised by
        # a job stays stored on its future and is not re-raised here.
        concurrent.futures.wait(futures)


def test_csv_consistency():
    """
    Check that ``example_files.yaml`` lists every file name and URL only once.

    Each entry is expected to carry the keys ``local_filename`` and ``url``.
    The assertion messages name the offending values.
    """
    from collections import Counter

    # The list lives in a YAML file whose entries use the key
    # "local_filename" (there is no "name" key and no CSV file).
    pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml")

    # Ensure the names are unique
    name_counts = Counter(pdf["local_filename"] for pdf in pdfs)
    duplicate_names = sorted(name for name, count in name_counts.items() if count > 1)
    assert not duplicate_names, f"Duplicate local_filename entries: {duplicate_names}"

    # Ensure the urls are unique
    url_counts = Counter(pdf["url"] for pdf in pdfs)
    duplicate_urls = sorted(url for url, count in url_counts.items() if count > 1)
    assert not duplicate_urls, f"Duplicate url entries: {duplicate_urls}"
112 changes: 112 additions & 0 deletions tests/example_files.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
- local_filename: 2201.00214.pdf
url: https://arxiv.org/pdf/2201.00214.pdf
- local_filename: ASurveyofImageClassificationBasedTechniques.pdf
url: https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf
- local_filename: Giacalone.pdf
url: https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf
- local_filename: iss1718.pdf
url: https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf
- local_filename: iss2077.pdf
url: https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf
- local_filename: pdf_font_garbled.pdf
url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf
- local_filename: The%20lean%20times%20in%20the%20Peruvian%20economy.pdf
url: https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf
- local_filename: tika-908104.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf
- local_filename: tika-923406.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf
- local_filename: tika-955562.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf
- local_filename: tika-959173.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf
- local_filename: waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf
url: https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf
- local_filename: tika-957144.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf
- local_filename: ascii charset.pdf
url: https://github.com/py-pdf/pypdf/files/9472500/main.pdf
- local_filename: cmap1370.pdf
url: https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf
- local_filename: 02voc.pdf
url: https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf
- local_filename: iss1533.pdf
url: https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf
- local_filename: tstUCS2.pdf
url: https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf
- local_filename: tst-GBK_EUC.pdf
url: https://github.com/py-pdf/pypdf/files/11315397/3.pdf
- local_filename: math_latex.pdf
url: https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf
- local_filename: unixxx_glyphs.pdf
url: https://arxiv.org/pdf/2201.00021.pdf
- local_filename: TextAttack_paper.pdf
url: https://arxiv.org/pdf/2005.05909.pdf
- local_filename: iss2173.pdf
url: https://github.com/py-pdf/pypdf/files/12552700/tt.pdf
- local_filename: iss2290.pdf
url: https://github.com/py-pdf/pypdf/files/13452885/example.pdf
- local_filename: NewJersey.pdf
url: https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf
- local_filename: tika-952445.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/952/952445.pdf
- local_filename: tika-921632.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/921/921632.pdf
- local_filename: tika-976970.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/976/976970.pdf
- local_filename: tika-914102.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/914/914102.pdf
- local_filename: iss1737.pdf
url: https://github.com/py-pdf/pypdf/files/11068604/tt1.pdf
- local_filename: issue-1801.pdf
url: https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf
- local_filename: tika-924546.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf
- local_filename: tika-924546.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf
- local_filename: issue-1801.png
url: https://user-images.githubusercontent.com/1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png
- local_filename: grimm10
url: https://github.com/py-pdf/pypdf/files/11336817/grimm10.pdf
- local_filename: labeled-edges-center-image.png
url: https://user-images.githubusercontent.com/4083478/236685544-a1940b06-fb42-4bb1-b589-1e4ad429d68e.png
- local_filename: pdf_font_garbled.pdf
url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf
- local_filename: watermark1.png
url: https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png
- local_filename: tika-977609.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/977/977609.pdf
- local_filename: tifimage.png
url: https://user-images.githubusercontent.com/4083478/236793166-288b4b59-dee3-49fd-a04e-410aab06199a.png
- local_filename: tika-972174.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf
- local_filename: tika-972174_p0-im0.png
url: https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png
- local_filename: Vitocal.pdf
url: https://github.com/py-pdf/pypdf/files/11962229/DB-5368770_Vitocal_200-G.pdf
- local_filename: VitocalImage.png
url: https://user-images.githubusercontent.com/4083478/251283945-38c5b92c-cf94-473c-bb57-a51b74fc39be.jpg
- local_filename: cmyk_deflate.pdf
url: https://github.com/py-pdf/pypdf/files/12078533/cmyk2.pdf
- local_filename: cmyk_deflate.tif
url: https://github.com/py-pdf/pypdf/files/12078556/cmyk.tif.txt
- local_filename: o1whh9b3.pdf
url: https://github.com/py-pdf/pypdf/files/11578953/USC.EMBA.-.Pre-Season.and.Theme.I.pdf
- local_filename: selbst.72916.pdf
url: https://www.selbst.de/paidcontent/dl/64733/72916
- local_filename: iss1912.pdf
url: https://github.com/py-pdf/pypdf/files/11845099/GeoTopo-komprimiert.pdf
- local_filename: calRGB.pdf
url: https://github.com/py-pdf/pypdf/files/12061061/tt.pdf
- local_filename: 2023USDC.pdf
url: https://github.com/py-pdf/pypdf/files/12090523/2023.USDC_Circle.Examination.Report.May.2023.pdf
- local_filename: iss1982_im1.png
url: https://github.com/py-pdf/pypdf/files/12144094/im1.png.txt
- local_filename: iss1982_im2.png
url: https://github.com/py-pdf/pypdf/files/12144093/im2.png.txt
- local_filename: tika-972174.pdf
url: https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf
- local_filename: usa.png
url: https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42
- local_filename: paid.pdf
url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf
Loading