From 12c70472ba665e09ea5844f683a852a1c98079f3 Mon Sep 17 00:00:00 2001
From: Justin Frahm <justin.frahm@maxar.com>
Date: Thu, 7 Apr 2022 11:22:58 -0600
Subject: [PATCH] ENH: Add options to customize extractText() (#334)

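These changes add optional `Tj_sep` and `TJ_sep` parameters to extractText(),
letting callers choose the separator inserted before each string drawn by the
Tj and TJ operators. The defaults ("" and " ") preserve the existing output.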

These source alterations were originally suggested by DSM on Stack Overflow:
http://stackoverflow.com/questions/11017379/pypdf-ignores-newlines-in-pdf-file

I'm just passing along the good suggestion in hopes that the change may become standard in some future version.
---
 PyPDF2/pdf.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
index f932d7f15..47e9df978 100644
--- a/PyPDF2/pdf.py
+++ b/PyPDF2/pdf.py
@@ -2656,7 +2656,7 @@ def compressContentStreams(self):
                 content = ContentStream(content, self.pdf)
             self[NameObject("/Contents")] = content.flateEncode()
 
-    def extractText(self):
+    def extractText(self, Tj_sep="", TJ_sep=" "):
         """
         Locate all text drawing commands, in the order they are provided in the
         content stream, and extract the text.  This works well for some PDF
@@ -2678,6 +2678,7 @@ def extractText(self):
             if operator == b_("Tj"):
                 _text = operands[0]
                 if isinstance(_text, TextStringObject):
+                    text += Tj_sep
                     text += _text
                     text += "\n"
             elif operator == b_("T*"):
@@ -2695,7 +2696,7 @@ def extractText(self):
             elif operator == b_("TJ"):
                 for i in operands[0]:
                     if isinstance(i, TextStringObject):
-                        text += " "
+                        text += TJ_sep
                         text += i
                 text += "\n"
         return text