scrapinghub · tcurvelo · Oct 20, 2019 · Oct 24, 2019 · Oct 25, 2019 · Nov 4, 2019
diff --git a/src/arche/__init__.py b/src/arche/__init__.py
@@ -5,11 +5,16 @@
 SH_URL = "https://app.scrapinghub.com/p"  # noqa
 
 from arche.arche import Arche
+from arche.tools import dataframe
 from arche.tools.schema import basic_json_schema
 import numpy as np
 import pandas as pd
 import plotly.io as pio
 
+pd.DataFrame._repr_html_ = dataframe._repr_html_  # applies to all DataFrames
+pd.set_option("display.max_colwidth", -1)  # -1: presumably "no truncation" - verify
+pd.set_option("display.render_links", True)  # registered by arche.tools.dataframe
+
 pio.renderers.default = "notebook_connected+jupyterlab"
 
 __all__ = ["basic_json_schema", "Arche", "np", "pd"]

diff --git a/src/arche/tools/dataframe.py b/src/arche/tools/dataframe.py
@@ -0,0 +1,65 @@
+from io import StringIO
+from typing import Optional
+
+from pandas._config import get_option, config
+from pandas.io.formats import format as fmt
+
+
+pc_render_links_doc = """
+: bool
+    This sets if URLs in DataFrame should be rendered as clickable anchors.
+"""
+
+# Register `display.render_links` once; a repeat call (module reload) raises.
+if "display.render_links" not in config._registered_options:
+    config.register_option(
+        "display.render_links", True, pc_render_links_doc, validator=config.is_bool
+    )
+
+
+def _repr_html_(self) -> Optional[str]:
+    """Return an HTML representation of this DataFrame, or None.
+
+    None results when `_info_repr()` is false and notebook HTML repr is off.
+    """
+    if self._info_repr():  # summary mode: show info() text, not a table
+        buf = StringIO("")
+        self.info(buf=buf)
+        # Escape only the first "<" and ">" (the <class ...> line, expected first).
+        val = buf.getvalue().replace("<", r"&lt;", 1)
+        val = val.replace(">", r"&gt;", 1)
+        return "<pre>" + val + "</pre>"
+
+    if get_option("display.notebook_repr_html"):  # falsy -> `return None` below
+        max_rows = get_option("display.max_rows")
+        min_rows = get_option("display.min_rows")
+        max_cols = get_option("display.max_columns")
+        show_dimensions = get_option("display.show_dimensions")
+        render_links = get_option("display.render_links")
+        # `display.render_links` is the option this module registers at import.
+        formatter = fmt.DataFrameFormatter(
+            self,
+            columns=None,
+            col_space=None,
+            na_rep="NaN",
+            formatters=None,
+            float_format=None,
+            sparsify=None,
+            justify=None,
+            index_names=True,
+            header=True,
+            index=True,
+            bold_rows=True,
+            escape=True,
+            max_rows=max_rows,
+            min_rows=min_rows,
+            max_cols=max_cols,
+            show_dimensions=show_dimensions,
+            decimal=".",
+            table_id=None,
+            render_links=render_links,
+        )
+        formatter.to_html(notebook=True)  # result is read back from formatter.buf
+        return formatter.buf.getvalue()
+    else:
+        return None
diff --git a/tests/tools/test_dataframe_html_output.py b/tests/tools/test_dataframe_html_output.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import pytest
+
+
+@pytest.fixture()
+def df_with_urls():  # two-row frame whose `col2` holds URL strings
+    data = {"col1": [1, 2], "col2": ["http://foo.com", "https://bar.com"]}
+    with pd.option_context("display.notebook_repr_html", True):  # undone on teardown
+        yield pd.DataFrame(data)
+
+
+def test_df_has_clickable_urls(df_with_urls):
+    html = df_with_urls._repr_html_()
+    # Both the http:// and https:// cells must appear as target="_blank" anchors.
+    assert '<a href="http://foo.com" target="_blank">http://foo.com</a>' in html
+    assert '<a href="https://bar.com" target="_blank">https://bar.com</a>' in html
+
+
+def test_derivaded_df_has_clickable_urls(df_with_urls):  # sic: "derived"
+    html = df_with_urls.head()._repr_html_()  # repr of the frame returned by head()
+    assert '<a href="http://foo.com" target="_blank">http://foo.com</a>' in html
+    assert '<a href="https://bar.com" target="_blank">https://bar.com</a>' in html
+
+
+def test_arche_df_does_not_add_links_if_no_url_found():
+    df = pd.DataFrame({"col1": [1, 2], "col2": ["foo", "bar"]})  # no URL strings
+    html = df._repr_html_()
+    assert "<a href=" not in html  # so no anchor markup may appear
+
+
+def test_large_repr(df_with_urls):
+    df_with_urls._info_repr = lambda: True  # make `_info_repr()` return True
+    html = df_with_urls._repr_html_()
+    # Expect the escaped <class ...> header inside <pre>, and no anchor markup.
+    assert "<pre>&lt;class 'pandas.core.frame.DataFrame'&gt;\n" in html
+    assert "<a href=" not in html