text-extraction/fitzcli.py

# -----------------------------------------------------------------------------
# Copyright 2020-2021, Harald Lieder, mailto:harald.lieder@outlook.com
# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
# Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
# maintained and developed by Artifex Software, Inc. https://artifex.com.
# -----------------------------------------------------------------------------
import argparse
import os
import sys
import time
import bisect
import fitz
from typing import List
from fitz.fitz import (
    TEXT_INHIBIT_SPACES,
    TEXT_PRESERVE_LIGATURES,
    TEXT_PRESERVE_WHITESPACE,
)

mycenter = lambda x: (" %s " % x).center(75, "-")


def recoverpix(doc, item):
    """Return image for a given XREF."""
    x = item[0]  # xref of PDF image
    s = item[1]  # xref of its /SMask
    if s == 0:  # no smask: use direct image output
        return doc.extract_image(x)

    def getimage(pix):
        if pix.colorspace.n != 4:
            return pix
        tpix = fitz.Pixmap(fitz.csRGB, pix)
        return tpix

    # we need to reconstruct the alpha channel with the smask
    pix1 = fitz.Pixmap(doc, x)
    pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry

    """Sanity check:
    - both pixmaps must have the same rectangle
    - both pixmaps must have alpha=0
    - pix2 must consist of 1 byte per pixel
    """
    if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
        print("Warning: unsupported /SMask %i for %i:" % (s, x))
        print(pix2)
        pix2 = None
        return getimage(pix1)  # return the pixmap as is

    pix = fitz.Pixmap(pix1)  # copy of pix1, with an alpha channel added
    pix.set_alpha(pix2.samples)  # treat pix2.samples as the alpha values
    pix1 = pix2 = None  # free temp pixmaps

    # we may need to adjust something for CMYK pixmaps here:
    return getimage(pix)


def open_file(filename, password, show=False, pdf=True):
    """Open and authenticate a document."""
    doc = fitz.open(filename)
    if not doc.is_pdf and pdf is True:
        sys.exit("this command supports PDF files only")
    rc = -1
    if not doc.needs_pass:
        return doc
    if password:
        rc = doc.authenticate(password)
        if not rc:
            sys.exit("authentication unsuccessful")
        if show is True:
            print("authenticated as %s" % "owner" if rc > 2 else "user")
    else:
        sys.exit("'%s' requires a password" % doc.name)
    return doc


def print_dict(item):
    """Print a Python dictionary."""
    l = max([len(k) for k in item.keys()]) + 1
    for k, v in item.items():
        msg = "%s: %s" % (k.rjust(l), v)
        print(msg)
    return


def print_xref(doc, xref):
    """Print an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size.
    """
    print("%i 0 obj" % xref)
    xref_str = doc.xref_object(xref)
    print(xref_str)
    if doc.is_stream(xref):
        temp = xref_str.split()
        try:
            idx = temp.index("/Length") + 1
            size = temp[idx]
            if size.endswith("0 R"):
                size = "unknown"
        except:
            size = "unknown"
        print("stream\n...%s bytes" % size)
        print("endstream")
    print("endobj")


def get_list(rlist, limit, what="page"):
    """Transform a page / xref specification into a list of integers.

    Args
    ----
        rlist: (str) the specification
        limit: maximum number, i.e. number of pages, number of objects
        what: a string to be used in error messages
    Returns
    -------
        A list of integers representing the specification.
    """
    N = str(limit - 1)
    rlist = rlist.replace("N", N).replace(" ", "")
    rlist_arr = rlist.split(",")
    out_list = []
    for seq, item in enumerate(rlist_arr):
        n = seq + 1
        if item.isdecimal():  # a single integer
            i = int(item)
            if 1 <= i < limit:
                out_list.append(int(item))
            else:
                sys.exit("bad %s specification at item %i" % (what, n))
            continue
        try:  # this must be a range now, and all of the following must work:
            i1, i2 = item.split("-")  # will fail if not 2 items produced
            i1 = int(i1)  # will fail on non-integers
            i2 = int(i2)
        except:
            sys.exit("bad %s range specification at item %i" % (what, n))

        if not (1 <= i1 < limit and 1 <= i2 < limit):
            sys.exit("bad %s range specification at item %i" % (what, n))

        if i1 == i2:  # just in case: a range of equal numbers
            out_list.append(i1)
            continue

        if i1 < i2:  # first less than second
            out_list += list(range(i1, i2 + 1))
        else:  # first larger than second
            out_list += list(range(i1, i2 - 1, -1))

    return out_list


def show(args):
    doc = open_file(args.input, args.password, True)
    size = os.path.getsize(args.input) / 1024
    flag = "KB"
    if size > 1000:
        size /= 1024
        flag = "MB"
    size = round(size, 1)
    meta = doc.metadata
    print(
        "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s"
        % (
            args.input,
            doc.page_count,
            doc.xref_length() - 1,
            size,
            flag,
            meta["format"],
            meta["encryption"],
        )
    )
    n = doc.is_form_pdf
    if n > 0:
        s = doc.get_sigflags()
        print(
            "document contains %i root form fields and is %ssigned"
            % (n, "not " if s != 3 else "")
        )
    n = doc.embfile_count()
    if n > 0:
        print("document contains %i embedded files" % n)
    print()
    if args.catalog:
        print(mycenter("PDF catalog"))
        xref = doc.pdf_catalog()
        print_xref(doc, xref)
        print()
    if args.metadata:
        print(mycenter("PDF metadata"))
        print_dict(doc.metadata)
        print()
    if args.xrefs:
        print(mycenter("object information"))
        xrefl = get_list(args.xrefs, doc.xref_length(), what="xref")
        for xref in xrefl:
            print_xref(doc, xref)
            print()
    if args.pages:
        print(mycenter("page information"))
        pagel = get_list(args.pages, doc.page_count + 1)
        for pno in pagel:
            n = pno - 1
            xref = doc.page_xref(n)
            print("Page %i:" % pno)
            print_xref(doc, xref)
            print()
    if args.trailer:
        print(mycenter("PDF trailer"))
        print(doc.pdf_trailer())
        print()
    doc.close()


def poster(args):
    doc = open_file(args.input, args.password, pdf=True)
    pages = get_list(args.pages, doc.page_count + 1)
    outdoc = fitz.open()
    outdoc.set_metadata(doc.metadata)
    for pno in pages:
        n = pno - 1
        page = doc[n]
        rect = page.rect
        w = rect.width / args.x
        h = rect.height / args.y
        rects = []
        for i in range(args.y):
            for j in range(args.x):
                r = fitz.Rect(j * w, i * h, (j + 1) * w, (i + 1) * h)
                opage = outdoc.new_page(width=w, height=h)
                opage.show_pdf_page(opage.rect, doc, n, clip=r)

    outdoc.ez_save(args.output)
    doc.close()
    outdoc.close()
    return


def clean(args):
    doc = open_file(args.input, args.password, pdf=True)
    encryption = args.encryption
    encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
        encryption
    )

    if not args.pages:  # simple cleaning
        doc.save(
            args.output,
            garbage=args.garbage,
            deflate=args.compress,
            pretty=args.pretty,
            clean=args.sanitize,
            ascii=args.ascii,
            linear=args.linear,
            encryption=encrypt,
            owner_pw=args.owner,
            user_pw=args.user,
            permissions=args.permission,
        )
        return

    # create sub document from page numbers
    pages = get_list(args.pages, doc.page_count + 1)
    outdoc = fitz.open()
    for pno in pages:
        n = pno - 1
        outdoc.insert_pdf(doc, from_page=n, to_page=n)
    outdoc.set_metadata(doc.metadata)
    outdoc.save(
        args.output,
        garbage=args.garbage,
        deflate=args.compress,
        pretty=args.pretty,
        clean=args.sanitize,
        ascii=args.ascii,
        linear=args.linear,
        encryption=encrypt,
        owner_pw=args.owner,
        user_pw=args.user,
        permissions=args.permission,
    )
    doc.close()
    outdoc.close()
    return


def doc_join(args):
    """Join pages from several PDF documents."""
    doc_list = args.input  # a list of input PDFs
    doc = fitz.open()  # output PDF
    for src_item in doc_list:  # process one input PDF
        src_list = src_item.split(",")
        password = src_list[1] if len(src_list) > 1 else None
        src = open_file(src_list[0], password, pdf=True)
        pages = ",".join(src_list[2:])  # get 'pages' specifications
        if pages:  # if anything there, retrieve a list of desired pages
            page_list = get_list(",".join(src_list[2:]), src.page_count + 1)
        else:  # take all pages
            page_list = range(1, src.page_count + 1)
        for i in page_list:
            doc.insert_pdf(src, from_page=i - 1, to_page=i - 1)  # copy each source page
        src.close()

    doc.save(args.output, garbage=4, deflate=True)
    doc.close()


def embedded_copy(args):
    """Copy embedded files between PDFs."""
    doc = open_file(args.input, args.password, pdf=True)
    if not doc.can_save_incrementally() and (
        not args.output or args.output == args.input
    ):
        sys.exit("cannot save PDF incrementally")
    src = open_file(args.source, args.pwdsource)
    names = set(args.name) if args.name else set()
    src_names = set(src.embfile_names())
    if names:
        if not names <= src_names:
            sys.exit("not all names are contained in source")
    else:
        names = src_names
    if not names:
        sys.exit("nothing to copy")
    intersect = names & set(doc.embfile_names())  # any equal name already in target?
    if intersect:
        sys.exit("following names already exist in receiving PDF: %s" % str(intersect))

    for item in names:
        info = src.embfile_info(item)
        buff = src.embfile_get(item)
        doc.embfile_add(
            item,
            buff,
            filename=info["filename"],
            ufilename=info["ufilename"],
            desc=info["desc"],
        )
        print("copied entry '%s' from '%s'" % (item, src.name))
    src.close()
    if args.output and args.output != args.input:
        doc.save(args.output, garbage=3)
    else:
        doc.saveIncr()
    doc.close()


def embedded_del(args):
    """Delete an embedded file entry."""
    doc = open_file(args.input, args.password, pdf=True)
    if not doc.can_save_incrementally() and (
        not args.output or args.output == args.input
    ):
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_del(args.name)
    except ValueError:
        sys.exit("no such embedded file '%s'" % args.name)
    if not args.output or args.output == args.input:
        doc.save_incr()
    else:
        doc.save(args.output, garbage=1)
    doc.close()


def embedded_get(args):
    """Retrieve contents of an embedded file."""
    doc = open_file(args.input, args.password, pdf=True)
    try:
        stream = doc.embfile_get(args.name)
        d = doc.embfile_info(args.name)
    except ValueError:
        sys.exit("no such embedded file '%s'" % args.name)
    filename = args.output if args.output else d["filename"]
    output = open(filename, "wb")
    output.write(stream)
    output.close()
    print("saved entry '%s' as '%s'" % (args.name, filename))
    doc.close()


def embedded_add(args):
    """Insert a new embedded file."""
    doc = open_file(args.input, args.password, pdf=True)
    if not doc.can_save_incrementally() and (
        args.output is None or args.output == args.input
    ):
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_del(args.name)
        sys.exit("entry '%s' already exists" % args.name)
    except:
        pass

    if not os.path.exists(args.path) or not os.path.isfile(args.path):
        sys.exit("no such file '%s'" % args.path)
    stream = open(args.path, "rb").read()
    filename = args.path
    ufilename = filename
    if not args.desc:
        desc = filename
    else:
        desc = args.desc
    doc.embfile_add(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )
    if not args.output or args.output == args.input:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()


def embedded_upd(args):
    """Update contents or metadata of an embedded file."""
    doc = open_file(args.input, args.password, pdf=True)
    if not doc.can_save_incrementally() and (
        args.output is None or args.output == args.input
    ):
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_info(args.name)
    except:
        sys.exit("no such embedded file '%s'" % args.name)

    if (
        args.path is not None
        and os.path.exists(args.path)
        and os.path.isfile(args.path)
    ):
        stream = open(args.path, "rb").read()
    else:
        stream = None

    if args.filename:
        filename = args.filename
    else:
        filename = None

    if args.ufilename:
        ufilename = args.ufilename
    elif args.filename:
        ufilename = args.filename
    else:
        ufilename = None

    if args.desc:
        desc = args.desc
    else:
        desc = None

    doc.embfile_upd(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )
    if args.output is None or args.output == args.input:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()


def embedded_list(args):
    """List embedded files."""
    doc = open_file(args.input, args.password, pdf=True)
    names = doc.embfile_names()
    if args.name is not None:
        if args.name not in names:
            sys.exit("no such embedded file '%s'" % args.name)
        else:
            print()
            print(
                "printing 1 of %i embedded file%s:"
                % (len(names), "s" if len(names) > 1 else "")
            )
            print()
            print_dict(doc.embfile_info(args.name))
            print()
            return
    if not names:
        print("'%s' contains no embedded files" % doc.name)
        return
    if len(names) > 1:
        msg = "'%s' contains the following %i embedded files" % (doc.name, len(names))
    else:
        msg = "'%s' contains the following embedded file" % doc.name
    print(msg)
    print()
    for name in names:
        if not args.detail:
            print(name)
            continue
        _ = doc.embfile_info(name)
        print_dict(doc.embfile_info(name))
        print()
    doc.close()


def extract_objects(args):
    """Extract images and / or fonts from a PDF."""
    if not args.fonts and not args.images:
        sys.exit("neither fonts nor images requested")
    doc = open_file(args.input, args.password, pdf=True)

    if args.pages:
        pages = get_list(args.pages, doc.page_count + 1)
    else:
        pages = range(1, doc.page_count + 1)

    if not args.output:
        out_dir = os.path.abspath(os.curdir)
    else:
        out_dir = args.output
        if not (os.path.exists(out_dir) and os.path.isdir(out_dir)):
            sys.exit("output directory %s does not exist" % out_dir)

    font_xrefs = set()  # already saved fonts
    image_xrefs = set()  # already saved images

    for pno in pages:
        if args.fonts:
            itemlist = doc.get_page_fonts(pno - 1)
            for item in itemlist:
                xref = item[0]
                if xref not in font_xrefs:
                    font_xrefs.add(xref)
                    fontname, ext, _, buffer = doc.extract_font(xref)
                    if ext == "n/a" or not buffer:
                        continue
                    outname = os.path.join(
                        out_dir, fontname.replace(" ", "-") + "." + ext
                    )
                    outfile = open(outname, "wb")
                    outfile.write(buffer)
                    outfile.close()
                    buffer = None
        if args.images:
            itemlist = doc.get_page_images(pno - 1)
            for item in itemlist:
                xref = item[0]
                if xref not in image_xrefs:
                    image_xrefs.add(xref)
                    pix = recoverpix(doc, item)
                    if type(pix) is dict:
                        ext = pix["ext"]
                        imgdata = pix["image"]
                        outname = os.path.join(out_dir, "img-%i.%s" % (xref, ext))
                        outfile = open(outname, "wb")
                        outfile.write(imgdata)
                        outfile.close()
                    else:
                        outname = os.path.join(out_dir, "img-%i.png" % xref)
                        pix2 = (
                            pix
                            if pix.colorspace.n < 4
                            else fitz.Pixmap(fitz.csRGB, pix)
                        )
                        pix2.save(outname)

    if args.fonts:
        print("saved %i fonts to '%s'" % (len(font_xrefs), out_dir))
    if args.images:
        print("saved %i images to '%s'" % (len(image_xrefs), out_dir))
    doc.close()


def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    if noformfeed:
        eop = b"\n"
    else:
        eop = bytes([12])
    text = page.get_text("text", flags=flags)
    if not text:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    textout.write(text.encode("utf8", errors="surrogatepass"))
    textout.write(eop)
    return


def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    if noformfeed:
        eop = b"\n"
    else:
        eop = bytes([12])
    blocks = page.get_text("blocks", flags=flags)
    if blocks == []:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    blocks.sort(key=lambda b: (b[3], b[0]))
    for b in blocks:
        textout.write(b[4].encode("utf8", errors="surrogatepass"))
    textout.write(eop)
    return


def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    left = page.rect.width  # left most used coordinate
    right = 0  # rightmost coordinate
    rowheight = page.rect.height  # smallest row height in use
    chars = []  # all chars here
    rows = set()  # bottom coordinates of lines
    if noformfeed:
        eop = b"\n"
    else:
        eop = bytes([12])

    # --------------------------------------------------------------------
    def curate_rows(rows, GRID):
        """Make list of integer y-coordinates of lines on page.

        Coordinates will be ascending and differ by 'GRID' points or more."""
        rows = list(rows)
        rows.sort()  # sort ascending
        nrows = [rows[0]]
        for h in rows[1:]:
            if h >= nrows[-1] + GRID:  # only keep significant differences
                nrows.append(h)
        return nrows  # curated list of line bottom coordinates

    # --------------------------------------------------------------------
    def curate_rows(rows, GRID):
        """Make list of integer y-coordinates of lines on page.

        Coordinates will be ascending and differ by 'GRID' points or more."""
        rows = list(rows)
        rows.sort()  # sort ascending
        nrows = [rows[0]]
        for h in rows[1:]:
            if h >= nrows[-1] + GRID:  # only keep significant differences
                nrows.append(h)
        return nrows  # curated list of line bottom coordinates

    # --------------------------------------------------------------------
    def find_line_index(values: List[int], value: int) -> int:
        """Find the right row coordinate (using bisect std package).

        Args:
            values: (list) y-coordinates of rows.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-ccordinate of appropriate line for value.
        """
        i = bisect.bisect_right(values, value)
        if i:
            return values[i - 1]
        raise RuntimeError("Line for %g not found in %s" % (value, values))

    # --------------------------------------------------------------------
    def make_lines(chars):
        lines = {}  # key: y1-ccordinate, value: char list
        for c in chars:
            ch, ox, oy, cwidth = c
            y = find_line_index(rows, oy)  # index of origin.y
            lchars = lines.get(y, [])  # read line chars so far
            lchars.append(c)
            lines[y] = lchars  # write back to line

        # ensure line coordinates are ascending
        keys = list(lines.keys())
        keys.sort()
        return lines, keys

    # --------------------------------------------------------------------
    def compute_slots(keys, lines, right, left):
        """Compute "char resolution" for the page.

        The char width corresponding to 1 text char position on output - call
        it 'slot'.
        For each line, compute median of its char widths. The minimum across
        all "relevant" lines is our 'slot'.
        The minimum char width of each line is used to determine if spaces must
        be inserted in between two characters.
        """
        slot = used_width = right - left
        lineslots = {}
        for k in keys:
            lchars = lines[k]  # char tuples of line
            ccount = len(lchars)  # how many
            if ccount < 2:  # if short line, just put in something
                lineslots[k] = (1, 1, 1)
                continue
            widths = [c[3] for c in lchars]  # list of all char widths
            widths.sort()
            line_width = sum(widths)  # total width used by line
            i = int(ccount / 2 + 0.5)  # index of median
            median = widths[i]  # take the median value
            if (
                line_width / used_width >= 0.3 and median < slot
            ):  # if line is significant
                slot = median  # update global slot
            lineslots[k] = (widths[0], median, widths[-1])  # line slots
        return slot, lineslots

    # --------------------------------------------------------------------
    def joinligature(lig):
        """Return ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00)
        """
        if lig == "ff":
            return chr(0xFB00)
        elif lig == "fi":
            return chr(0xFB01)
        elif lig == "fl":
            return chr(0xFB02)
        elif lig == "ft":
            return chr(0xFB05)
        elif lig == "st":
            return chr(0xFB06)
        elif lig == "ffi":
            return chr(0xFB03)
        elif lig == "ffl":
            return chr(0xFB04)
        return lig

    # --------------------------------------------------------------------
    def process_blocks(page, flags):
        left = page.rect.width  # left most used coordinate
        right = 0  # rightmost coordinate
        rowheight = page.rect.height  # smallest row height in use
        chars = []  # all chars here
        rows = set()  # bottom coordinates of lines
        blocks = page.get_text("rawdict", flags=flags)["blocks"]
        for block in blocks:
            for line in block["lines"]:
                if line["dir"] != (1, 0):  # ignore non-horizontal text
                    continue
                x0, y0, x1, y1 = line["bbox"]
                if y1 < 0 or y0 > page.rect.height:  # ignore if outside CropBox
                    continue
                # upd row height
                height = y1 - y0

                if rowheight > height:
                    rowheight = height
                for span in line["spans"]:
                    if span["size"] <= fontsize:
                        continue
                    for c in span["chars"]:
                        x0, _, x1, _ = c["bbox"]
                        cwidth = x1 - x0
                        ox, oy = c["origin"]
                        oy = int(round(oy))
                        rows.add(oy)
                        ch = c["c"]
                        if left > ox and ch != " ":
                            left = ox  # update left coordinate
                        if right < x1:
                            right = x1  # update right coordinate
                        # handle ligatures:
                        if cwidth == 0 and chars != []:  # potential ligature
                            old_ch, old_ox, old_oy, old_cwidth = chars[-1]
                            if old_oy == oy:  # ligature!
                                if old_ch != chr(0xFB00):  # previous "ff" char lig?
                                    lig = joinligature(old_ch + ch)  # 2-char
                                # convert to one of the 3-char ligatures:
                                elif ch == "i":
                                    lig = chr(0xFB03)  # "ffi"
                                elif ch == "l":
                                    lig = chr(0xFB04)  # "ffl"
                                else:  # something wrong, leave old char in place
                                    lig = old_ch
                                chars[-1] = (lig, old_ox, old_oy, old_cwidth)
                                continue
                        chars.append((ch, ox, oy, cwidth))  # all chars on page
        return rows, chars, rowheight, left, right

    # --------------------------------------------------------------------
    def make_textline(left, slot, lineslots, lchars):
        """Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) avg width of one character in any font in use.
            minslot: (float) min width for the characters in this line.
            chars: (list[tuple]) characters of this line.
        Returns:
            text: (str) text string for this line
        """
        minslot, median, maxslot = lineslots
        text = ""  # we output this
        old_x1 = 0  # end coordinate of last char
        old_ox = 0  # x-origin of last char
        if minslot <= fitz.EPSILON:
            raise RuntimeError("program error: minslot too small = %g" % minslot)

        for c in lchars:  # loop over characters
            char, ox, _, cwidth = c
            ox = ox - left  # its (relative) start coordinate
            x1 = ox + cwidth  # ending coordinate

            # eliminate overprint effect
            if (
                old_ox <= ox < old_x1
                and char == text[-1]
                and ox - old_ox <= cwidth * 0.2
            ):
                continue

            # omit spaces overlapping previous char
            if char == " " and (old_x1 - ox) / cwidth > 0.8:
                continue

            # close enough to previous?
            if ox < old_x1 + minslot:  # assume char adjacent to previous
                text += char  # append to output
                old_x1 = x1  # new end coord
                old_ox = ox  # new origin.x
                continue

            # else next char starts after some gap:
            # fill in right number of spaces, so char is positioned
            # in the right slot of the line
            delta = int(ox / slot) - len(text)
            if delta > 1 and ox <= old_x1 + slot * 2:
                delta = 1
            if ox > old_x1 and delta >= 1:
                text += " " * delta
            # now append char
            text += char
            old_x1 = x1  # new end coordinate
            old_ox = ox  # new origin
        return text.rstrip()

    # extract page text by single characters ("rawdict")
    rows, chars, rowheight, left, right = process_blocks(page, flags)
    if rows == set():
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    # compute list of line coordinates - ignoring small (GRID) differences
    rows = curate_rows(rows, GRID)

    # sort all chars by x-coordinates, so every line will receive
    # them sorted.
    chars.sort(key=lambda c: c[1])

    # populate the lines with their char tuples
    lines, keys = make_lines(chars)

    slot, lineslots = compute_slots(keys, lines, right, left)

    # compute line advance in text output
    rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.5
    rowpos = rows[0]  # first line positioned here
    textout.write(b"\n")
    for k in keys:  # walk through the lines
        while rowpos < k:  # honor distance between lines
            textout.write(b"\n")
            rowpos += rowheight
        text = make_textline(left, slot, lineslots[k], lines[k])
        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
        rowpos = k + rowheight

    textout.write(eop)  # write end-of-page


def gettext(args):
    doc = open_file(args.input, args.password, pdf=False)
    pagel = get_list(args.pages, doc.page_count + 1)
    output = args.output
    if output == None:
        filename, _ = os.path.splitext(doc.name)
        output = filename + ".txt"
    textout = open(output, "wb")
    flags = TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE
    if args.convert_white:
        flags ^= TEXT_PRESERVE_WHITESPACE
    if args.noligatures:
        flags ^= TEXT_PRESERVE_LIGATURES
    if args.extra_spaces:
        flags ^= TEXT_INHIBIT_SPACES
    func = {
        "simple": page_simple,
        "blocks": page_blocksort,
        "layout": page_layout,
    }
    for pno in pagel:
        page = doc[pno - 1]
        func[args.mode](
            page,
            textout,
            args.grid,
            args.fontsize,
            args.noformfeed,
            args.skip_empty,
            flags=flags,
        )

    textout.close()


def main():
    """Define command configurations."""
    parser = argparse.ArgumentParser(
        prog="fitz",
        description=mycenter("Basic PyMuPDF Functions"),
    )
    subps = parser.add_subparsers(
        title="Subcommands", help="Enter 'command -h' for subcommand specific help"
    )

    # -------------------------------------------------------------------------
    # 'show' command
    # -------------------------------------------------------------------------
    ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
    ps_show.add_argument("input", type=str, help="PDF filename")
    ps_show.add_argument("-password", help="password")
    ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
    ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
    ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
    ps_show.add_argument(
        "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
    )
    ps_show.add_argument(
        "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
    )
    ps_show.set_defaults(func=show)

    # -------------------------------------------------------------------------
    # 'clean' command
    # -------------------------------------------------------------------------
    ps_clean = subps.add_parser(
        "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
    )
    ps_clean.add_argument("input", type=str, help="PDF filename")
    ps_clean.add_argument("output", type=str, help="output PDF filename")
    ps_clean.add_argument("-password", help="password")

    ps_clean.add_argument(
        "-encryption",
        help="encryption method",
        choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
        default="none",
    )

    ps_clean.add_argument("-owner", type=str, help="owner password")
    ps_clean.add_argument("-user", type=str, help="user password")

    ps_clean.add_argument(
        "-garbage",
        type=int,
        help="garbage collection level",
        choices=range(5),
        default=0,
    )

    ps_clean.add_argument(
        "-compress",
        action="store_true",
        default=False,
        help="compress (deflate) output",
    )

    ps_clean.add_argument(
        "-ascii", action="store_true", default=False, help="ASCII encode binary data"
    )

    ps_clean.add_argument(
        "-linear",
        action="store_true",
        default=False,
        help="format for fast web display",
    )

    ps_clean.add_argument(
        "-permission", type=int, default=-1, help="integer with permission levels"
    )

    ps_clean.add_argument(
        "-sanitize",
        action="store_true",
        default=False,
        help="sanitize / clean contents",
    )
    ps_clean.add_argument(
        "-pretty", action="store_true", default=False, help="prettify PDF structure"
    )
    ps_clean.add_argument(
        "-pages", help="output selected pages pages, format: 1,5-7,50-N"
    )
    ps_clean.set_defaults(func=clean)

    # -------------------------------------------------------------------------
    # 'join' command
    # -------------------------------------------------------------------------
    ps_join = subps.add_parser(
        "join",
        description=mycenter("join PDF documents"),
        epilog="specify each input as 'filename[,password[,pages]]'",
    )
    ps_join.add_argument("input", nargs="*", help="input filenames")
    ps_join.add_argument("-output", required=True, help="output filename")
    ps_join.set_defaults(func=doc_join)

    # -------------------------------------------------------------------------
    # 'extract' command
    # -------------------------------------------------------------------------
    ps_extract = subps.add_parser(
        "extract", description=mycenter("extract images and fonts to disk")
    )
    ps_extract.add_argument("input", type=str, help="PDF filename")
    ps_extract.add_argument("-images", action="store_true", help="extract images")
    ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
    ps_extract.add_argument(
        "-output", help="folder to receive output, defaults to current"
    )
    ps_extract.add_argument("-password", help="password")
    ps_extract.add_argument(
        "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
    )
    ps_extract.set_defaults(func=extract_objects)

    # -------------------------------------------------------------------------
    # 'embed-info'
    # -------------------------------------------------------------------------
    ps_show = subps.add_parser(
        "embed-info", description=mycenter("list embedded files")
    )
    ps_show.add_argument("input", help="PDF filename")
    ps_show.add_argument("-name", help="if given, report only this one")
    ps_show.add_argument("-detail", action="store_true", help="detail information")
    ps_show.add_argument("-password", help="password")
    ps_show.set_defaults(func=embedded_list)

    # -------------------------------------------------------------------------
    # 'embed-add' command
    # -------------------------------------------------------------------------
    ps_embed_add = subps.add_parser(
        "embed-add", description=mycenter("add embedded file")
    )
    ps_embed_add.add_argument("input", help="PDF filename")
    ps_embed_add.add_argument("-password", help="password")
    ps_embed_add.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_add.add_argument("-name", required=True, help="name of new entry")
    ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
    ps_embed_add.add_argument("-desc", help="description of new entry")
    ps_embed_add.set_defaults(func=embedded_add)

    # -------------------------------------------------------------------------
    # 'embed-del' command
    # -------------------------------------------------------------------------
    ps_embed_del = subps.add_parser(
        "embed-del", description=mycenter("delete embedded file")
    )
    ps_embed_del.add_argument("input", help="PDF filename")
    ps_embed_del.add_argument("-password", help="password")
    ps_embed_del.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
    ps_embed_del.set_defaults(func=embedded_del)

    # -------------------------------------------------------------------------
    # 'embed-upd' command
    # -------------------------------------------------------------------------
    ps_embed_upd = subps.add_parser(
        "embed-upd",
        description=mycenter("update embedded file"),
        epilog="except '-name' all parameters are optional",
    )
    ps_embed_upd.add_argument("input", help="PDF filename")
    ps_embed_upd.add_argument("-name", required=True, help="name of entry")
    ps_embed_upd.add_argument("-password", help="password")
    ps_embed_upd.add_argument(
        "-output", help="Output PDF filename, incremental save if none"
    )
    ps_embed_upd.add_argument("-path", help="path to new data for entry")
    ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
    ps_embed_upd.add_argument(
        "-ufilename", help="new unicode filename to store in entry"
    )
    ps_embed_upd.add_argument("-desc", help="new description to store in entry")
    ps_embed_upd.set_defaults(func=embedded_upd)

    # -------------------------------------------------------------------------
    # 'embed-extract' command
    # -------------------------------------------------------------------------
    ps_embed_extract = subps.add_parser(
        "embed-extract", description=mycenter("extract embedded file to disk")
    )
    ps_embed_extract.add_argument("input", type=str, help="PDF filename")
    ps_embed_extract.add_argument("-name", required=True, help="name of entry")
    ps_embed_extract.add_argument("-password", help="password")
    ps_embed_extract.add_argument(
        "-output", help="output filename, default is stored name"
    )
    ps_embed_extract.set_defaults(func=embedded_get)

    # -------------------------------------------------------------------------
    # 'embed-copy' command
    # -------------------------------------------------------------------------
    ps_embed_copy = subps.add_parser(
        "embed-copy", description=mycenter("copy embedded files between PDFs")
    )
    ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
    ps_embed_copy.add_argument("-password", help="password of input")
    ps_embed_copy.add_argument(
        "-output", help="output PDF, incremental save to 'input' if omitted"
    )
    ps_embed_copy.add_argument(
        "-source", required=True, help="copy embedded files from here"
    )
    ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
    ps_embed_copy.add_argument(
        "-name", nargs="*", help="restrict copy to these entries"
    )
    ps_embed_copy.set_defaults(func=embedded_copy)

    # -------------------------------------------------------------------------
    # 'textlayout' command
    # -------------------------------------------------------------------------
    ps_gettext = subps.add_parser(
        "gettext", description=mycenter("extract text in various formatting modes")
    )
    ps_gettext.add_argument("input", type=str, help="input document filename")
    ps_gettext.add_argument("-password", help="password for input document")
    ps_gettext.add_argument(
        "-mode",
        type=str,
        help="mode: simple, block sort, or layout (default)",
        choices=("simple", "blocks", "layout"),
        default="layout",
    )
    ps_gettext.add_argument(
        "-pages",
        type=str,
        help="select pages, format: 1,5-7,50-N",
        default="1-N",
    )
    ps_gettext.add_argument(
        "-noligatures",
        action="store_true",
        help="expand ligature characters (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-convert-white",
        action="store_true",
        help="convert whitespace characters to space (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-extra-spaces",
        action="store_true",
        help="fill gaps with spaces (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-noformfeed",
        action="store_true",
        help="write linefeeds, no formfeeds (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-skip-empty",
        action="store_true",
        help="suppress pages with no text (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-output",
        help="store text in this file (default inputfilename.txt)",
    )
    ps_gettext.add_argument(
        "-grid",
        type=float,
        help="merge lines if closer than this (default 2)",
        default=2,
    )
    ps_gettext.add_argument(
        "-fontsize",
        type=float,
        help="only include text with a larger fontsize (default 3)",
        default=3,
    )
    ps_gettext.set_defaults(func=gettext)

    # -------------------------------------------------------------------------
    # start program
    # -------------------------------------------------------------------------
    args = parser.parse_args()  # create parameter arguments class
    if not hasattr(args, "func"):  # no function selected
        parser.print_help()  # so print top level help
    else:
        args.func(args)  # execute requested command


if __name__ == "__main__":
    main()