Skip to content

Commit

Permalink
TST: Add test for decrypting files (#661)
Browse files Browse the repository at this point in the history
Also: Adjust ground truth "crazyones" example text
  • Loading branch information
MartinThoma authored Apr 6, 2022
1 parent 02cc54b commit 1cde559
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ jobs:
flake8 . --ignore E,F,I,SIM,C,PT,N,ASS,A,P,R,W
- name: Test with pytest
run: |
pytest Tests/tests.py --cov --cov-report term-missing -vv
pytest Tests/tests.py Tests --cov --cov-report term-missing -vv
2 changes: 1 addition & 1 deletion Resources/crazyones.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
TheCrazyOnesOctober14,1998Herestothecrazyones.Themis˝ts.Therebels.Thetroublemakers.Theroundpegsinthesquareholes.Theoneswhoseethingsdi˙erently.Theyrenotfondofrules.Andtheyhavenorespectforthestatusquo.Youcanquotethem,disagreewiththem,glorifyorvilifythem.Abouttheonlythingyoucantdoisignorethem.Becausetheychangethings.Theyinvent.Theyimagine.Theyheal.Theyexplore.Theycreate.Theyinspire.Theypushthehumanraceforward.Maybetheyhavetobecrazy.Howelsecanyoustareatanemptycanvasandseeaworkofart?Orsitinsilenceandhearasongthatsneverbeenwritten?Orgazeataredplanetandseealaboratoryonwheels?Wemaketoolsforthesekindsofpeople.Whilesomeseethemasthecrazyones,weseegenius.Becausethepeoplewhoarecrazyenoughtothinktheycanchangetheworld,aretheoneswhodo.
The Cr azy Ones Octob er 14, 1998 Heres to the crazy ones. The mis˝ts. The reb els. The troublemak ers. The round p egs in the square holes. The ones who see things di˙eren tly . Theyre not fond of rules. And they ha v e no resp ect for the status quo. Y ou can quote them, disagree with them, glorify or vilify them. Ab out the only thing y ou cant do is ignore them. Because they c hange things. They in v en t. They imagine. They heal. They explore. They create. They inspire. They push the h uman race forw ard. Ma yb e they ha v e to b e crazy . Ho w else can y ou stare at an empt y can v as and see a w ork of art? Or sit in silence and hear a song thats nev er b een written? Or gaze at a red planet and see a lab oratory on wheels? W e mak e to ols for these kinds of p eople. While some see them as the crazy ones, w e see genius. Because the p eople who are crazy enough to think they can c hange the w orld, are the ones who do.
Binary file added Resources/libreoffice-writer-password.pdf
Binary file not shown.
71 changes: 71 additions & 0 deletions Tests/test_workflows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os
import binascii
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

# Absolute path of the directory that contains this test module.
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
# Parent directory of TESTS_ROOT.
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
# "Resources" directory under PROJECT_ROOT; the tests below open their fixture files from here.
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "Resources")

# Append PROJECT_ROOT to the module search path.
# NOTE(review): this runs after the `from PyPDF2 import ...` statement earlier in
# this module, so it cannot influence that import -- confirm whether it is still needed.
sys.path.append(PROJECT_ROOT)

def test_PdfReaderFileLoad():
    """
    Check text extraction against a stored reference.

    Opens crazyones.pdf, extracts the text of its first page, strips the
    newlines, encodes the result as UTF-8 and compares it with the raw bytes
    of crazyones.txt.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
    reference_path = os.path.join(RESOURCE_ROOT, "crazyones.txt")

    with open(pdf_path, "rb") as pdf_stream:
        # The reader is used while the stream is still open.
        reader = PdfFileReader(pdf_stream)
        first_page = reader.getPage(0)

        # Reference text, read as bytes so it compares with the encoded extract.
        with open(reference_path, "rb") as reference_stream:
            expected = reference_stream.read()

        extracted = first_page.extractText()
        extracted = extracted.replace("\n", "")
        extracted = extracted.encode("utf-8")

        assert extracted == expected, (
            "PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
            % (expected, extracted)
        )


def test_PdfReaderJpegImage():
    """
    Check extraction of an embedded image against a stored reference.

    Opens jpeg.pdf, fetches the data of the "/Im4" entry in the first page's
    "/XObject" resources, and compares its hex dump with the contents of
    jpeg.txt.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "jpeg.pdf")
    reference_path = os.path.join(RESOURCE_ROOT, "jpeg.txt")

    with open(pdf_path, "rb") as pdf_stream:
        reader = PdfFileReader(pdf_stream)

        # Reference hex dump of the image, stored as text.
        with open(reference_path, "r") as reference_stream:
            expected_hex = reference_stream.read()

        first_page = reader.getPage(0)
        x_objects = first_page["/Resources"]["/XObject"].getObject()
        image_bytes = x_objects["/Im4"].getData()
        actual_hex = binascii.hexlify(image_bytes).decode()

        assert actual_hex == expected_hex, (
            "PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n"
            % (expected_hex, actual_hex)
        )


def test_decrypt():
    """
    Open a password-protected PDF, decrypt it and read its page count.

    Checks that libreoffice-writer-password.pdf reports itself as encrypted,
    that the page count is 1 after calling decrypt(), and that isEncrypted is
    still truthy afterwards.
    """
    encrypted_path = os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf")

    with open(encrypted_path, "rb") as inputfile:
        ipdf = PdfFileReader(inputfile)
        # Assert truthiness directly rather than comparing with `== True`.
        assert ipdf.isEncrypted
        # NOTE(review): the return value of decrypt() is not checked here, so a
        # rejected password would not by itself fail this test -- consider
        # asserting on it once the expected return value is confirmed.
        ipdf.decrypt("openpassword")
        assert ipdf.getNumPages() == 1
        # The flag is expected to stay set after decrypt().
        assert ipdf.isEncrypted

# Is extractText() broken for encrypted files?
# assert ipdf.getPage(0).extractText().replace('\n', '') == "\n˘\n\u02c7\u02c6˙\n\n\n˘\u02c7\u02c6˙\n\n"

0 comments on commit 1cde559

Please sign in to comment.