Skip to content

Commit

Permalink
Table padding option (#130)
Browse files Browse the repository at this point in the history
Add Table padding option
  • Loading branch information
theSage21 authored and Alir3z4 committed May 29, 2016
1 parent 3950277 commit df1d723
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 4 deletions.
1 change: 1 addition & 0 deletions ChangeLog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ xxxx.x.x
=========
----

* Fix #125: --pad-tables now pads table cells to make them look nice.
* Fix #114: Break does not interrupt blockquotes
* Deprecation warnings for URL retrieval

Expand Down
1 change: 1 addition & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,4 @@ Command line options
| `--mark-code` | Mark code with [code]...[/code] blocks
| `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links`
| `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc.
| `--pad-tables` | Use padding to make tables look good.
22 changes: 18 additions & 4 deletions html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
list_numbering_start,
dumb_css_parser,
escape_md_section,
skipwrap
skipwrap,
pad_tables_in_text
)

__version__ = (2016, 4, 2)
Expand Down Expand Up @@ -77,6 +78,7 @@ def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
self.hide_strikethrough = False # covered in cli
self.mark_code = config.MARK_CODE
self.wrap_links = config.WRAP_LINKS # covered in cli
self.pad_tables = config.PAD_TABLES # covered in cli
self.tag_callback = None

if out is None: # pragma: no cover
Expand Down Expand Up @@ -130,7 +132,11 @@ def feed(self, data):
def handle(self, data):
    """
    Convert ``data`` and return the resulting text.

    ``data`` is fed to the parser, followed by an empty string, and the
    closed output is passed through ``optwrap``. When ``self.pad_tables``
    is set the wrapped text is additionally run through
    ``pad_tables_in_text`` before it is returned.
    """
    self.feed(data)
    self.feed("")
    markdown = self.optwrap(self.close())
    # Padding is a post-processing step on the finished text, so it must
    # run after wrapping and must not be bypassed by an earlier return.
    if self.pad_tables:
        return pad_tables_in_text(markdown)
    return markdown

def outtextf(self, s):
self.outtextlist.append(s)
Expand Down Expand Up @@ -556,8 +562,16 @@ def handle_tag(self, tag, attrs, start):
self.o('</{0}>'.format(tag))

else:
if tag == "table" and start:
self.table_start = True
if tag == "table":
if start:
self.table_start = True
if self.pad_tables:
self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
self.o(" \n")
else:
if self.pad_tables:
self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
self.o(" \n")
if tag in ["td", "th"] and start:
if self.split_next_td:
self.o("| ")
Expand Down
8 changes: 8 additions & 0 deletions html2text/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ class bcolors: # pragma: no cover
'%prog [(filename|url) [encoding]]',
version='%prog ' + ".".join(map(str, __version__))
)
p.add_option(
"--pad-tables",
dest="pad_tables",
action="store_true",
default=config.PAD_TABLES,
help="pad the cells to equal column width in tables"
)
p.add_option(
"--no-wrap-links",
dest="wrap_links",
Expand Down Expand Up @@ -271,5 +278,6 @@ class bcolors: # pragma: no cover
h.links_each_paragraph = options.links_each_paragraph
h.mark_code = options.mark_code
h.wrap_links = options.wrap_links
h.pad_tables = options.pad_tables

wrapwrite(h.handle(data))
3 changes: 3 additions & 0 deletions html2text/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0

# Marker to use for marking tables for padding post processing
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
# Escape all special characters. Output is less readable, but avoids
# corner case formatting issues.
ESCAPE_SNOB = 0
Expand Down Expand Up @@ -36,6 +38,7 @@
IGNORE_EMPHASIS = False
MARK_CODE = False
DECODE_ERRORS = 'strict'
PAD_TABLES = False

# Convert links with same href and text to <href> format if they are absolute links
USE_AUTOMATIC_LINKS = True
Expand Down
52 changes: 52 additions & 0 deletions html2text/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,55 @@ def escape_md_section(text, snob=False):
text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)

return text

def reformat_table(lines, right_margin):
    """
    Given the lines of a table, pad the cells and return the new lines.

    Each line is split on ``|``; every cell is stripped of trailing
    whitespace and then padded so that all cells of a column share the
    same width (the widest cell plus ``right_margin``). Lines made up only
    of ``-`` and ``|`` are treated as the header separator and are padded
    with dashes; all other lines are padded with spaces.

    Rows may have different numbers of cells: each row keeps all of its
    own cells. An empty ``lines`` list yields an empty list.
    """
    # Split every row once; trailing whitespace must not count towards
    # the column width.
    rows = [[cell.rstrip() for cell in line.split('|')] for line in lines]

    # Find the maximum width of the columns. The list grows with the
    # widest row so that ragged rows are never cut down to the shortest
    # row seen.
    max_width = []
    for row in rows:
        for index, cell in enumerate(row):
            width = len(cell) + right_margin
            if index == len(max_width):
                max_width.append(width)
            elif width > max_width[index]:
                max_width[index] = width

    # Reformat
    new_lines = []
    for line, row in zip(lines, rows):
        filler = '-' if set(line.strip()) == set('-|') else ' '
        new_cols = [cell.ljust(width, filler)
                    for cell, width in zip(row, max_width)]
        new_lines.append('|'.join(new_cols))
    return new_lines

def pad_tables_in_text(text, right_margin=1):
    """
    Provide padding for tables in the text.

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` delimit a table. The
    marker lines themselves are dropped, the lines between an opening and
    a closing marker are reformatted with ``reformat_table`` and followed
    by one empty line, and every other line is kept unchanged.

    If the text ends before a closing marker is seen, the buffered table
    lines are still reformatted and emitted rather than discarded.

    NOTE: every marker line simply toggles the "inside a table" state, so
    markers are expected to come in strictly alternating open/close pairs.
    """
    new_lines = []
    table_buffer = []
    table_started = False
    for line in text.split('\n'):
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # Closing marker: flush the padded table. An empty buffer
                # is skipped so there is nothing to reformat.
                if table_buffer:
                    new_lines.extend(
                        reformat_table(table_buffer, right_margin))
                    table_buffer = []
                new_lines.append('')
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)
    # The text ended inside a table (no closing marker): keep its content.
    if table_buffer:
        new_lines.extend(reformat_table(table_buffer, right_margin))
    return '\n'.join(new_lines)
26 changes: 26 additions & 0 deletions test/pad_table.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<!DOCTYPE html> <html>
<head lang="en"> <meta charset="UTF-8"> <title></title> </head>
<body> <h1>This is a test document</h1> With some text, <code>code</code>, <b>bolds</b> and <i>italics</i>. <h2>This is second header</h2> <p style="display: none">Displaynone text</p>
<table>
<tr> <th>Header 1</th> <th>Header 2</th> <th>Header 3</th> </tr>
<tr> <td>Content 1</td> <td>2</td> <td><img src="http://lorempixel.com/200/200" alt="200"/> Image!</td> </tr>
<tr> <td>Content 1 longer</td> <td>Content 2</td> <td>blah</td> </tr>
<tr> <td>Content </td> <td>Content 2</td> <td>blah</td> </tr>
<tr> <td>t </td> <td>Content 2</td> <td>blah blah blah</td> </tr>
</table>


<table> <tr> <th>H1</th> <th>H2</th> <th>H3</th> </tr>
<tr> <td>C1</td> <td>Content 2</td> <td>x</td> </tr>
<tr> <td>C123</td> <td>Content 2</td> <td>xyz</td> </tr>
</table>

some content between the tables<br>

<table> <tr> <th>Header 1</th> <th>Header 2</th> <th>Header 3</th> </tr>
<tr> <td>Content 1</td> <td>Content 2</td> <td><img src="http://lorempixel.com/200/200" alt="200"/> Image!</td> </tr>
<tr> <td>Content 1</td> <td>Content 2 longer</td> <td><img src="http://lorempixel.com/200/200" alt="200"/> Image!</td> </tr>
</table>

something else entirely
</body> </html>
28 changes: 28 additions & 0 deletions test/pad_table.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# This is a test document

With some text, `code`, **bolds** and _italics_.

## This is second header

Displaynone text

Header 1 | Header 2 | Header 3
-----------------|-----------|----------------------------------------------
Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image!
Content 1 longer | Content 2 | blah
Content | Content 2 | blah
t | Content 2 | blah blah blah

H1 | H2 | H3
-----|-----------|-----
C1 | Content 2 | x
C123 | Content 2 | xyz

some content between the tables
Header 1 | Header 2 | Header 3
----------|------------------|----------------------------------------------
Content 1 | Content 2 | ![200](http://lorempixel.com/200/200) Image!
Content 1 | Content 2 longer | ![200](http://lorempixel.com/200/200) Image!

something else entirely

4 changes: 4 additions & 0 deletions test/test_html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,10 @@ def test_func(self):
module_args['mark_code'] = True
cmdline_args.append('--mark-code')

if base_fn.startswith('pad_table'):
module_args['pad_tables'] = True
cmdline_args.append('--pad-tables')

if base_fn not in ['bodywidth_newline.html', 'abbr_tag.html']:
test_func = None

Expand Down

0 comments on commit df1d723

Please sign in to comment.