diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index 33143f5cd..f382fe2b9 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -69,6 +69,8 @@ pytest-timeout==2.1.0 # via -r requirements/ci.in pytest-xdist==3.3.1 # via -r requirements/ci.in +pyyaml==6.0.1 + # via -r requirements/ci.in ruff==0.0.290 # via -r requirements/ci.in typeguard==4.1.2 diff --git a/requirements/ci.in b/requirements/ci.in index ff071d125..4c14acc41 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -17,3 +17,4 @@ pytest-cov typeguard types-dataclasses types-Pillow +pyyaml diff --git a/requirements/ci.txt b/requirements/ci.txt index 0d2814426..ebb121aa6 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -83,6 +83,8 @@ pytest-timeout==2.1.0 # via -r requirements/ci.in pytest-xdist==3.0.2 # via -r requirements/ci.in +pyyaml==6.0.1 + # via -r requirements/ci.in six==1.16.0 # via flake8-print tomli==1.2.3 diff --git a/tests/__init__.py b/tests/__init__.py index c9f4dc3dd..d81f2c94c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,13 +1,16 @@ +import concurrent.futures import ssl import urllib.request from pathlib import Path -from typing import List +from typing import Dict, List, Optional from urllib.error import HTTPError +import yaml + from pypdf.generic import DictionaryObject, IndirectObject -def get_data_from_url(url: str, name: str) -> bytes: +def get_data_from_url(url: Optional[str] = None, name: Optional[str] = None) -> bytes: """ Download a File from a URL and return its contents. 
@@ -22,28 +25,33 @@ def get_data_from_url(url: str, name: str) -> bytes: Returns: Read File as bytes """ - if url.startswith("file://"): - with open(url[7:].replace("\\", "/"), "rb") as fp: - return fp.read() + if name is None: + raise ValueError("A name must always be specified") + cache_dir = Path(__file__).parent / "pdf_cache" if not cache_dir.exists(): cache_dir.mkdir() cache_path = cache_dir / name - if not cache_path.exists(): - ssl._create_default_https_context = ssl._create_unverified_context - cpt = 3 - while cpt > 0: - try: - with urllib.request.urlopen( # noqa: S310 - url - ) as response, cache_path.open("wb") as out_file: - out_file.write(response.read()) - cpt = 0 - except HTTPError as e: - if cpt > 0: - cpt -= 1 - else: - raise e + + if url is not None: + if url.startswith("file://"): + with open(url[7:].replace("\\", "/"), "rb") as fp: + return fp.read() + if not cache_path.exists(): + ssl._create_default_https_context = ssl._create_unverified_context + cpt = 3 + while cpt > 0: + try: + with urllib.request.urlopen( # noqa: S310 + url + ) as response, cache_path.open("wb") as out_file: + out_file.write(response.read()) + cpt = 0 + except HTTPError: + cpt -= 1 + if cpt == 0: + # Retries exhausted: re-raise the HTTP error instead of failing later on a missing cache file. + raise with open(cache_path, "rb") as fp: data = fp.read() return data @@ -106,12 +114,34 @@ def is_sublist(child_list, parent_list): return is_sublist(child_list, parent_list[1:]) +def read_yaml_to_list_of_dicts(yaml_file: Path) -> List[Dict[str, str]]: + """Parse yaml_file with yaml.safe_load and return the loaded data.""" + with open(yaml_file, encoding="utf-8") as yaml_input: + data = yaml.safe_load(yaml_input) + return data + + def download_test_pdfs(): """ Run this before the tests are executed to ensure you have everything locally. This is especially important to avoid pytest timeouts. 
""" - pdfs = [("https://arxiv.org/pdf/2201.00214.pdf", "2201.00214.pdf")] - for url, name in pdfs: - get_data_from_url(url, name=name) + pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml") + + with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: + futures = [ + executor.submit(get_data_from_url, pdf["url"], name=pdf["local_filename"]) + for pdf in pdfs + ] + concurrent.futures.wait(futures) + + +def test_csv_consistency(): + """Check that example_files.yaml lists every local file name and every URL only once.""" + pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml") + # Ensure the names are unique + assert len(pdfs) == len({pdf["local_filename"] for pdf in pdfs}) + + # Ensure the urls are unique + assert len(pdfs) == len({pdf["url"] for pdf in pdfs}) diff --git a/tests/example_files.yaml b/tests/example_files.yaml new file mode 100644 index 000000000..f12a78444 --- /dev/null +++ b/tests/example_files.yaml @@ -0,0 +1,112 @@ +- local_filename: 2201.00214.pdf + url: https://arxiv.org/pdf/2201.00214.pdf +- local_filename: ASurveyofImageClassificationBasedTechniques.pdf + url: https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf +- local_filename: Giacalone.pdf + url: https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf +- local_filename: iss1718.pdf + url: https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf +- local_filename: iss2077.pdf + url: https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf +- local_filename: pdf_font_garbled.pdf + url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf +- local_filename: The%20lean%20times%20in%20the%20Peruvian%20economy.pdf + url: https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf +- local_filename: 
tika-908104.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf +- local_filename: tika-923406.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf +- local_filename: tika-955562.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf +- local_filename: tika-959173.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf +- local_filename: waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf + url: https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf +- local_filename: tika-957144.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf +- local_filename: ascii charset.pdf + url: https://github.com/py-pdf/pypdf/files/9472500/main.pdf +- local_filename: cmap1370.pdf + url: https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf +- local_filename: 02voc.pdf + url: https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf +- local_filename: iss1533.pdf + url: https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf +- local_filename: tstUCS2.pdf + url: https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf +- local_filename: tst-GBK_EUC.pdf + url: https://github.com/py-pdf/pypdf/files/11315397/3.pdf +- local_filename: math_latex.pdf + url: https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf +- local_filename: unixxx_glyphs.pdf + url: https://arxiv.org/pdf/2201.00021.pdf +- local_filename: TextAttack_paper.pdf + url: https://arxiv.org/pdf/2005.05909.pdf +- local_filename: iss2173.pdf + url: https://github.com/py-pdf/pypdf/files/12552700/tt.pdf +- local_filename: iss2290.pdf + url: https://github.com/py-pdf/pypdf/files/13452885/example.pdf +- local_filename: NewJersey.pdf + url: https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf +- local_filename: tika-952445.pdf + url: 
https://corpora.tika.apache.org/base/docs/govdocs1/952/952445.pdf +- local_filename: tika-921632.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/921/921632.pdf +- local_filename: tika-976970.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/976/976970.pdf +- local_filename: tika-914102.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/914/914102.pdf +- local_filename: iss1737.pdf + url: https://github.com/py-pdf/pypdf/files/11068604/tt1.pdf +- local_filename: issue-1801.pdf + url: https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf +- local_filename: tika-924546.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf +- local_filename: tika-924546.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf +- local_filename: issue-1801.png + url: https://user-images.githubusercontent.com/1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png +- local_filename: grimm10 + url: https://github.com/py-pdf/pypdf/files/11336817/grimm10.pdf +- local_filename: labeled-edges-center-image.png + url: https://user-images.githubusercontent.com/4083478/236685544-a1940b06-fb42-4bb1-b589-1e4ad429d68e.png +- local_filename: pdf_font_garbled.pdf + url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf +- local_filename: watermark1.png + url: https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png +- local_filename: tika-977609.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/977/977609.pdf +- local_filename: tifimage.png + url: https://user-images.githubusercontent.com/4083478/236793166-288b4b59-dee3-49fd-a04e-410aab06199a.png +- local_filename: tika-972174.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf +- local_filename: tika-972174_p0-im0.png + url: https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png +- local_filename: Vitocal.pdf + url: 
https://github.com/py-pdf/pypdf/files/11962229/DB-5368770_Vitocal_200-G.pdf +- local_filename: VitocalImage.png + url: https://user-images.githubusercontent.com/4083478/251283945-38c5b92c-cf94-473c-bb57-a51b74fc39be.jpg +- local_filename: cmyk_deflate.pdf + url: https://github.com/py-pdf/pypdf/files/12078533/cmyk2.pdf +- local_filename: cmyk_deflate.tif + url: https://github.com/py-pdf/pypdf/files/12078556/cmyk.tif.txt +- local_filename: o1whh9b3.pdf + url: https://github.com/py-pdf/pypdf/files/11578953/USC.EMBA.-.Pre-Season.and.Theme.I.pdf +- local_filename: selbst.72916.pdf + url: https://www.selbst.de/paidcontent/dl/64733/72916 +- local_filename: iss1912.pdf + url: https://github.com/py-pdf/pypdf/files/11845099/GeoTopo-komprimiert.pdf +- local_filename: calRGB.pdf + url: https://github.com/py-pdf/pypdf/files/12061061/tt.pdf +- local_filename: 2023USDC.pdf + url: https://github.com/py-pdf/pypdf/files/12090523/2023.USDC_Circle.Examination.Report.May.2023.pdf +- local_filename: iss1982_im1.png + url: https://github.com/py-pdf/pypdf/files/12144094/im1.png.txt +- local_filename: iss1982_im2.png + url: https://github.com/py-pdf/pypdf/files/12144093/im2.png.txt +- local_filename: tika-972174.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf +- local_filename: usa.png + url: https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42 +- local_filename: paid.pdf + url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf diff --git a/tests/test_cmap.py b/tests/test_cmap.py index bf68d9e47..3c1150895 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -17,24 +17,24 @@ [ # compute_space_width: ( - "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf", + None, "tika-923406.pdf", False, ), # _parse_to_unicode_process_rg: ( - "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf", + None, "tika-959173.pdf", False, ), ( - "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf", + 
None, "tika-959173.pdf", True, ), # issue #1718: ( - "https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf", + None, "iss1718.pdf", False, ), @@ -53,27 +53,24 @@ def test_text_extraction_slow(caplog, url: str, name: str, strict: bool): [ # bfchar_on_2_chars: issue #1293 ( - "https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/" - "c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf", + None, "ASurveyofImageClassificationBasedTechniques.pdf", False, ), # L40, get_font_width_from_default ( - "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf", + None, "tika-908104.pdf", False, ), # multiline_bfrange / regression test for issue #1285: ( - "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/" - "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf", + None, "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf", False, ), ( - "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/" - "Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf", + None, "Giacalone.pdf", False, ), @@ -89,10 +86,7 @@ def test_text_extraction_fast(caplog, url: str, name: str, strict: bool): @pytest.mark.enable_socket() def test_parse_encoding_advanced_encoding_not_implemented(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf" - name = "tika-957144.pdf" - - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-957144.pdf"))) with pytest.warns(PdfReadWarning, match="Advanced encoding .* not implemented yet"): for page in reader.pages: page.extract_text() @@ -100,10 +94,8 @@ def test_parse_encoding_advanced_encoding_not_implemented(): @pytest.mark.enable_socket() def test_ascii_charset(): - # iss #1312 - url = "https://github.com/py-pdf/pypdf/files/9472500/main.pdf" - name = "ascii charset.pdf" - reader = 
PdfReader(BytesIO(get_data_from_url(url, name=name))) + # Issue #1312 + reader = PdfReader(BytesIO(get_data_from_url(name="ascii charset.pdf"))) assert "/a" not in reader.pages[0].extract_text() @@ -112,13 +104,13 @@ def test_ascii_charset(): ("url", "name", "page_nb", "within_text"), [ ( - "https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf", + None, "cmap1370.pdf", 0, "", ), ( - "https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", + None, "02voc.pdf", 2, "Document delineation and character sequence decoding", @@ -135,9 +127,7 @@ def test_text_extraction_of_specific_pages( @pytest.mark.enable_socket() def test_iss1533(): - url = "https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf" - name = "iss1533.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss1533.pdf"))) reader.pages[0].extract_text() # no error assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü" @@ -147,14 +137,14 @@ def test_iss1533(): ("url", "name", "page_index", "within_text", "caplog_text"), [ ( - "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf", + None, "tstUCS2.pdf", 1, ["2 / 12", "S0490520090001", "于博"], "", ), ( - "https://github.com/py-pdf/pypdf/files/11315397/3.pdf", + None, "tst-GBK_EUC.pdf", 0, ["NJA", "中华男科学杂志"], @@ -172,9 +162,7 @@ def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text) @pytest.mark.enable_socket() def test_latex(): - url = "https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf" - name = "math_latex.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="math_latex.pdf"))) txt = reader.pages[0].extract_text() # no error for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"): assert pat in txt @@ -183,9 +171,7 @@ def test_latex(): @pytest.mark.enable_socket() def test_unixxx_glyphs(): - url = 
"https://arxiv.org/pdf/2201.00021.pdf" - name = "unixxx_glyphs.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="unixxx_glyphs.pdf"))) txt = reader.pages[0].extract_text() # no error for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"): assert pat in txt @@ -195,27 +181,22 @@ def test_unixxx_glyphs(): def test_cmap_compute_space_width(): # issue 2137 # original file URL: - url = "https://arxiv.org/pdf/2005.05909.pdf" + # url = "https://arxiv.org/pdf/2005.05909.pdf" # URL from github issue is too long to pass code stype check, use original arxiv URL instead # url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf" - name = "TextAttack_paper.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf"))) reader.pages[0].extract_text() # no error @pytest.mark.enable_socket() def test_tabs_in_cmap(): """Issue #2173""" - url = "https://github.com/py-pdf/pypdf/files/12552700/tt.pdf" - name = "iss2173.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss2173.pdf"))) reader.pages[0].extract_text() @pytest.mark.enable_socket() def test_ignoring_non_put_entries(): """Issue #2290""" - url = "https://github.com/py-pdf/pypdf/files/13452885/example.pdf" - name = "iss2290.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf"))) reader.pages[0].extract_text() diff --git a/tests/test_filters.py b/tests/test_filters.py index 00a548ab0..e38280244 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -142,9 +142,7 @@ def test_decode_ahx(): See #1979 Gray Image in CMYK : requiring reverse """ - url = 
"https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf" - name = "NewJersey.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="NewJersey.pdf"))) for p in reader.pages: _ = list(p.images.keys()) @@ -231,9 +229,7 @@ def test_ccitt_fax_decode(): @pytest.mark.enable_socket() @patch("pypdf._reader.logger_warning") def test_decompress_zlib_error(mock_logger_warning): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/952/952445.pdf" - name = "tika-952445.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf"))) for page in reader.pages: page.extract_text() mock_logger_warning.assert_called_with( @@ -243,9 +239,7 @@ def test_decompress_zlib_error(mock_logger_warning): @pytest.mark.enable_socket() def test_lzw_decode_neg1(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/921/921632.pdf" - name = "tika-921632.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf"))) page = reader.pages[47] with pytest.raises(PdfReadError) as exc: page.extract_text() @@ -254,17 +248,13 @@ def test_lzw_decode_neg1(): @pytest.mark.enable_socket() def test_issue_399(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/976/976970.pdf" - name = "tika-976970.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-976970.pdf"))) reader.pages[1].extract_text() @pytest.mark.enable_socket() def test_image_without_pillow(tmp_path): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914102.pdf" name = "tika-914102.pdf" - _ = get_data_from_url(url, name=name) pdf_path = Path(__file__).parent / "pdf_cache" / name pdf_path_str = str(pdf_path.resolve()).replace("\\", "/") @@ 
-304,9 +294,7 @@ def test_image_without_pillow(tmp_path): @pytest.mark.enable_socket() def test_issue_1737(): - url = "https://github.com/py-pdf/pypdf/files/11068604/tt1.pdf" - name = "iss1737.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss1737.pdf"))) reader.pages[0]["/Resources"]["/XObject"]["/Im0"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im1"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im2"].get_data() @@ -319,9 +307,7 @@ def test_pa_image_extraction(): This is a regression test for issue #1801 """ - url = "https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf" - name = "issue-1801.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="issue-1801.pdf"))) page0 = reader.pages[0] images = page0.images @@ -329,20 +315,14 @@ def test_pa_image_extraction(): assert images[0].name == "Im1.png" # Ensure visual appearence - data = get_data_from_url( - "https://user-images.githubusercontent.com/" - "1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png", - "issue-1801.png", - ) + data = get_data_from_url(name="issue-1801.png") assert data == images[0].data @pytest.mark.enable_socket() def test_1bit_image_extraction(): """Cf issue #1814""" - url = "https://github.com/py-pdf/pypdf/files/11336817/grimm10.pdf" - name = "grimm10" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="grimm10"))) for p in reader.pages: p.images @@ -352,9 +332,9 @@ def test_png_transparency_reverse(): """Cf issue #1599""" pdf_path = RESOURCE_ROOT / "labeled-edges-center-image.pdf" reader = PdfReader(pdf_path) - url_png = "https://user-images.githubusercontent.com/4083478/236685544-a1940b06-fb42-4bb1-b589-1e4ad429d68e.png" - name_png = "labeled-edges-center-image.png" - _refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) + _refimg = 
Image.open( + BytesIO(get_data_from_url(name="labeled-edges-center-image.png")) + ) data = reader.pages[0].images[0] _img = Image.open(BytesIO(data.data)) assert ".jp2" in data.name @@ -364,12 +344,8 @@ def test_png_transparency_reverse(): @pytest.mark.enable_socket() def test_iss1787(): """Cf issue #1787""" - url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf" - name = "pdf_font_garbled.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png" - name_png = "watermark1.png" - refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) + reader = PdfReader(BytesIO(get_data_from_url(name="pdf_font_garbled.pdf"))) + refimg = Image.open(BytesIO(get_data_from_url(name="watermark1.png"))) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".png" in data.name @@ -385,12 +361,8 @@ def test_iss1787(): @pytest.mark.enable_socket() def test_tiff_predictor(): """Decode Tiff Predictor 2 Images""" - url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977609.pdf" - name = "tika-977609.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://user-images.githubusercontent.com/4083478/236793166-288b4b59-dee3-49fd-a04e-410aab06199a.png" - name_png = "tifimage.png" - refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-977609.pdf"))) + refimg = Image.open(BytesIO(get_data_from_url(name="tifimage.png"))) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".png" in data.name @@ -400,15 +372,11 @@ def test_tiff_predictor(): @pytest.mark.enable_socket() def test_rgba(): """Decode rgb with transparency""" - url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf" - name = "tika-972174.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, 
name=name))) - url_png = "https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png" - name_png = "tika-972174_p0-im0.png" + reader = PdfReader(BytesIO(get_data_from_url(name="tika-972174.pdf"))) data = reader.pages[0].images[0] assert ".jp2" in data.name similarity = image_similarity( - data.image, BytesIO(get_data_from_url(url_png, name=name_png)) + data.image, BytesIO(get_data_from_url(name="tika-972174_p0-im0.png")) ) assert similarity > 0.99 @@ -421,23 +389,15 @@ def test_cmyk(): from Crypto.Cipher import AES # noqa: F401 except ImportError: return # the file is encrypted - url = "https://github.com/py-pdf/pypdf/files/11962229/DB-5368770_Vitocal_200-G.pdf" - name = "Vitocal.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://user-images.githubusercontent.com/4083478/251283945-38c5b92c-cf94-473c-bb57-a51b74fc39be.jpg" - name_png = "VitocalImage.png" - refimg = BytesIO(get_data_from_url(url_png, name=name_png)) + reader = PdfReader(BytesIO(get_data_from_url(name="Vitocal.pdf"))) + refimg = BytesIO(get_data_from_url(name="VitocalImage.png")) data = reader.pages[1].images[0] assert data.image.mode == "CMYK" assert ".jpg" in data.name assert image_similarity(data.image, refimg) > 0.99 # deflate - url = "https://github.com/py-pdf/pypdf/files/12078533/cmyk2.pdf" - name = "cmyk_deflate.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://github.com/py-pdf/pypdf/files/12078556/cmyk.tif.txt" - name_png = "cmyk_deflate.tif" - refimg = BytesIO(get_data_from_url(url_png, name=name_png)) + reader = PdfReader(BytesIO(get_data_from_url(name="cmyk_deflate.pdf"))) + refimg = BytesIO(get_data_from_url(name="cmyk_deflate.tif")) data = reader.pages[0].images[0] assert data.image.mode == "CMYK" assert ".tif" in data.name @@ -447,9 +407,7 @@ def test_cmyk(): @pytest.mark.enable_socket() def test_iss1863(): """Test doc from iss1863""" - url = 
"https://github.com/py-pdf/pypdf/files/11578953/USC.EMBA.-.Pre-Season.and.Theme.I.pdf" - name = "o1whh9b3.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="o1whh9b3.pdf"))) for p in reader.pages: for i in p.images: i.name @@ -457,9 +415,7 @@ def test_iss1863(): @pytest.mark.enable_socket() def test_read_images(): - url = "https://www.selbst.de/paidcontent/dl/64733/72916" - name = "selbst.72916.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="selbst.72916.pdf"))) page = reader.pages[0] for _ in page.images: pass @@ -467,9 +423,7 @@ def test_read_images(): @pytest.mark.enable_socket() def test_cascaded_filters_images(): - url = "https://github.com/py-pdf/pypdf/files/11845099/GeoTopo-komprimiert.pdf" - name = "iss1912.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss1912.pdf"))) # for focus, analyse the page 23 for p in reader.pages: for i in p.images: @@ -478,40 +432,28 @@ def test_cascaded_filters_images(): @pytest.mark.enable_socket() def test_calrgb(): - url = "https://github.com/py-pdf/pypdf/files/12061061/tt.pdf" - name = "calRGB.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="calRGB.pdf"))) reader.pages[0].images[0] @pytest.mark.enable_socket() def test_index_lookup(): """The lookup is provided as an str and bytes""" - url = "https://github.com/py-pdf/pypdf/files/12090523/2023.USDC_Circle.Examination.Report.May.2023.pdf" - name = "2023USDC.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="2023USDC.pdf"))) # TextStringObject Lookup - url_png = "https://github.com/py-pdf/pypdf/files/12144094/im1.png.txt" - name_png = "iss1982_im1.png" - refimg = BytesIO(get_data_from_url(url_png, name=name_png)) + 
refimg = BytesIO(get_data_from_url(name="iss1982_im1.png")) data = reader.pages[0].images[-1] assert data.image.mode == "RGB" assert image_similarity(data.image, refimg) > 0.999 # ByteStringObject Lookup - url_png = "https://github.com/py-pdf/pypdf/files/12144093/im2.png.txt" - name_png = "iss1982_im2.png" - refimg = BytesIO(get_data_from_url(url_png, name=name_png)) + refimg = BytesIO(get_data_from_url(name="iss1982_im2.png")) data = reader.pages[-1].images[-1] assert data.image.mode == "RGB" assert image_similarity(data.image, refimg) > 0.999 # indexed CMYK images # currently with a TODO as we convert to RBG the palette - url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf" - name = "tika-972174.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42" - name_png = "usa.png" - refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-972174.pdf"))) + refimg = Image.open(BytesIO(get_data_from_url(name="usa.png"))) data = reader.pages[0].images["/Im3"] # assert data.image.mode == "PA" but currently "RGBA" assert image_similarity(data.image, refimg) > 0.999 @@ -520,9 +462,7 @@ def test_index_lookup(): @pytest.mark.enable_socket() def test_2bits_image(): """From #1954, test with 2bits image. 
TODO: 4bits also""" - url = "https://github.com/py-pdf/pypdf/files/12050253/tt.pdf" - name = "paid.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="paid.pdf"))) url_png = "https://user-images.githubusercontent.com/4083478/253568117-ca95cc85-9dea-4145-a5e0-032f1c1aa322.png" name_png = "Paid.png" refimg = BytesIO(get_data_from_url(url_png, name=name_png)) diff --git a/tests/test_images.py b/tests/test_images.py index a309549e6..3e9e8a034 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -105,9 +105,8 @@ def test_image_similarity_mid(): @pytest.mark.enable_socket() def test_image_new_property(): - url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf" name = "pdf_font_garbled.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name=name))) assert reader.pages[0].images.keys() == [ "/I0", "/I1", @@ -215,9 +214,7 @@ def test_image_extraction(src, page_index, image_key, expected): @pytest.mark.timeout(30) def test_loop_in_image_keys(): """Cf #2077""" - url = "https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf" - name = "iss2077.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss2077.pdf"))) reader.pages[0]["/Resources"]["/XObject"][NameObject("/toto")] = NullObject() reader.pages[0].images.keys() diff --git a/tests/test_page_labels.py b/tests/test_page_labels.py index 8b2e11b0d..1eb6f6aab 100644 --- a/tests/test_page_labels.py +++ b/tests/test_page_labels.py @@ -70,9 +70,8 @@ def test_number2uppercase_letter(): @pytest.mark.enable_socket() def test_index2label(caplog): - url = "https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" name = "waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" - r = PdfReader(BytesIO(get_data_from_url(url, name=name))) + r = 
PdfReader(BytesIO(get_data_from_url(name=name))) assert index2label(r, 1) == "ii" assert index2label(r, 9) == "6" # very silly data to get test cover diff --git a/tests/test_reader.py b/tests/test_reader.py index b252e48f9..555a3b2fe 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1027,17 +1027,13 @@ def test_header(src, pdf_header): @pytest.mark.enable_socket() def test_outline_color(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" - name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-924546.pdf"))) assert reader.outline[0].color == [0, 0, 1] @pytest.mark.enable_socket() def test_outline_font_format(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" - name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-924546.pdf"))) assert reader.outline[0].font_format == 2 diff --git a/tests/test_xmp.py b/tests/test_xmp.py index e01e5c6fd..f864a9df9 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -118,7 +118,7 @@ def test_identity_function(x): ("url", "name", "xmpmm_instance_id"), [ ( - "https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf", + None, "tika-955562.pdf", "uuid:ca96e032-c2af-49bd-a71c-95889bafbf1d", )