diff --git a/resources/crazyones_layout_vertical_space.txt b/resources/crazyones_layout_vertical_space.txt new file mode 100644 index 000000000..b745f6f63 --- /dev/null +++ b/resources/crazyones_layout_vertical_space.txt @@ -0,0 +1,19 @@ +The Crazy Ones +October 14, 1998 + + Heres to the crazy ones. The misfits. The rebels. The troublemakers. + The round pegs in the square holes. + The ones who see things differently. Theyre not fond of rules. And + they have no respect for the status quo. You can quote them, + disagree with them, glorify or vilify them. + About the only thing you cant do is ignore them. Because they change + things. They invent. They imagine. They heal. They explore. They + create. They inspire. They push the human race forward. + Maybe they have to be crazy. + How else can you stare at an empty canvas and see a work of art? Or + sit in silence and hear a song thats never been written? Or gaze at + a red planet and see a laboratory on wheels? + We make tools for these kinds of people. + While some see them as the crazy ones, we see genius. Because the + people who are crazy enough to think they can change the world, + are the ones who do. \ No newline at end of file diff --git a/resources/crazyones_layout_vertical_space_font_height_weight.txt b/resources/crazyones_layout_vertical_space_font_height_weight.txt new file mode 100644 index 000000000..e90fe87e9 --- /dev/null +++ b/resources/crazyones_layout_vertical_space_font_height_weight.txt @@ -0,0 +1,25 @@ +The Crazy Ones +October 14, 1998 + + Heres to the crazy ones. The misfits. The rebels. The troublemakers. + The round pegs in the square holes. + + The ones who see things differently. Theyre not fond of rules. And + they have no respect for the status quo. You can quote them, + disagree with them, glorify or vilify them. + + About the only thing you cant do is ignore them. Because they change + things. They invent. They imagine. They heal. They explore. They + create. They inspire. 
They push the human race forward. + + Maybe they have to be crazy. + + How else can you stare at an empty canvas and see a work of art? Or + sit in silence and hear a song thats never been written? Or gaze at + a red planet and see a laboratory on wheels? + + We make tools for these kinds of people. + + While some see them as the crazy ones, we see genius. Because the + people who are crazy enough to think they can change the world, + are the ones who do. \ No newline at end of file
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 1908e7f15..8c51b858f 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -219,3 +219,51 @@ def test_text_leading_height_unit():
     page = reader.pages[0]
     extracted = page.extract_text()
     assert "Something[cited]\n" in extracted
+
+
+def test_layout_mode_space_vertically_font_height_weight():
+    """Check layout-mode extraction with vertical spacing against fixtures (issue #2915).
+
+    Page 0 of ``crazyones.pdf`` is extracted twice with
+    ``layout_mode_space_vertically=True``: once without
+    ``layout_mode_font_height_weight`` and once with it set to 0.85. Each
+    result, encoded as UTF-8, must equal the matching fixture file under
+    ``RESOURCE_ROOT`` byte for byte (after CRLF normalisation of the fixture).
+    """
+    # (fixture file name, extra keyword arguments passed to extract_text)
+    cases = [
+        # Normal behaviour
+        ("crazyones_layout_vertical_space.txt", {}),
+        # Blank lines are added to truly separate paragraphs
+        (
+            "crazyones_layout_vertical_space_font_height_weight.txt",
+            {"layout_mode_font_height_weight": 0.85},
+        ),
+    ]
+
+    # Extraction runs while the PDF file handle is still open.
+    with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as inputfile:
+        reader = PdfReader(inputfile)
+        page = reader.pages[0]
+
+        for fixture_name, extra_kwargs in cases:
+            with open(RESOURCE_ROOT / fixture_name, "rb") as pdftext_file:
+                # fix for windows: normalise CRLF before any comparison
+                expected = pdftext_file.read().replace(b"\r\n", b"\n")
+
+            extracted = page.extract_text(
+                extraction_mode="layout",
+                layout_mode_space_vertically=True,
+                **extra_kwargs,
+            ).encode("utf-8")
+
+            # Compare line by line first so a failure names the first differing line;
+            # zip() stops at the shorter input, so the whole-text check below is still needed.
+            line_pairs = zip(expected.splitlines(), extracted.splitlines())
+            for line_number, (expected_line, actual_line) in enumerate(line_pairs, start=1):
+                assert actual_line == expected_line, "%s: line %d differs" % (fixture_name, line_number)
+
+            assert extracted == expected, (
+                "PDF extracted text differs from expected value.\n\n"
+                "Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (expected, extracted)
+            )