From 12c70472ba665e09ea5844f683a852a1c98079f3 Mon Sep 17 00:00:00 2001 From: Justin Frahm Date: Thu, 7 Apr 2022 11:22:58 -0600 Subject: [PATCH] ENH: Add options to customize extractText() (#334) These changes allow an optional text separator to be specified for the TJ and Tj operators. These source alterations were originally suggested by DSM on Stack Overflow (http://stackoverflow.com/questions/11017379/pypdf-ignores-newlines-in-pdf-file). I'm just passing along the good suggestion in the hope that the change may become standard in some future version. --- PyPDF2/pdf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py index f932d7f15..47e9df978 100644 --- a/PyPDF2/pdf.py +++ b/PyPDF2/pdf.py @@ -2656,7 +2656,7 @@ def compressContentStreams(self): content = ContentStream(content, self.pdf) self[NameObject("/Contents")] = content.flateEncode() - def extractText(self): + def extractText(self, Tj_sep="", TJ_sep=" "): """ Locate all text drawing commands, in the order they are provided in the content stream, and extract the text. This works well for some PDF @@ -2678,6 +2678,7 @@ def extractText(self): if operator == b_("Tj"): _text = operands[0] if isinstance(_text, TextStringObject): + text += Tj_sep text += _text text += "\n" elif operator == b_("T*"): @@ -2695,7 +2696,7 @@ def extractText(self): elif operator == b_("TJ"): for i in operands[0]: if isinstance(i, TextStringObject): - text += " " + text += TJ_sep text += i text += "\n" return text